diff --git a/Project.toml b/Project.toml index 4b1b864090..85abbabaca 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TrixiParticles" uuid = "66699cd8-9c01-4e9d-a059-b96c86d16b3a" -authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"] version = "0.4.5-dev" +authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"] [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" @@ -18,7 +18,6 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221" PointNeighbors = "1c4d5385-0a27-49de-8e2c-43b175c8985c" Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -57,7 +56,6 @@ ForwardDiff = "1" GPUArraysCore = "0.2" JSON = "1" KernelAbstractions = "0.9" -MuladdMacro = "0.2" OrdinaryDiffEq = "6.91" OrdinaryDiffEqCore = "2, 3" PointNeighbors = "0.6.5" diff --git a/docs/literate/src/tut_custom_kernel.jl b/docs/literate/src/tut_custom_kernel.jl index 25a3bed683..053f0c74a0 100644 --- a/docs/literate/src/tut_custom_kernel.jl +++ b/docs/literate/src/tut_custom_kernel.jl @@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end # By looking at the implementation of existing kernels in TrixiParticles.jl, # we can see that a kernel implementation requires three functions. -# `TrixiParticles.kernel`, which is the kernel function itself, -# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function, -# and `TrixiParticles.compact_support`, which defines the compact support of the -# kernel in relation to the smoothing length. +# `TrixiParticles.kernel_unsafe`, which is the kernel function itself, +# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the +# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines +# the compact support of the kernel in relation to the smoothing length. # The latter is relevant for determining the search radius of the neighborhood search. -function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h) +# +# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since +# this avoids an extra division in the hot loop and is robust near ``r=0``. +# The public function `TrixiParticles.kernel_deriv` is defined automatically by +# TrixiParticles from this method (and multiplies by ``r`` again when needed). +# In the unsafe functions, we do not check the compact support; this is handled in the +# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the +# compact support defined in `TrixiParticles.compact_support`. +function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h) q = r / h - if q < 2 - return 1 / (pi * h^2) * exp(-q^2) - end - - return 0.0 + return 1 / (pi * h^2) * exp(-q^2) end -function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h) +function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h) q = r / h - if q < 2 - return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h - end - - return 0.0 + kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h + return kernel_deriv / r end TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h diff --git a/docs/make.jl b/docs/make.jl index 6863c72fb1..3c04970b77 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -72,7 +72,8 @@ makedocs(sitename="TrixiParticles.jl", plugins=[bib], # Run doctests and check docs for the following modules modules=[TrixiParticles, TrixiBase], - format=Documenter.HTML(; assets=Asciicast.assets()), + # Set edit_link explicitly to avoid `git remote show origin` lookups. + format=Documenter.HTML(; assets=Asciicast.assets(), edit_link="main"), # Explicitly specify documentation structure pages=[ "Home" => "index.md", diff --git a/docs/src/general/smoothing_kernels.md b/docs/src/general/smoothing_kernels.md index 6e68b1152d..ea36cbf8aa 100644 --- a/docs/src/general/smoothing_kernels.md +++ b/docs/src/general/smoothing_kernels.md @@ -16,108 +16,107 @@ The following smoothing kernels are currently available: Any Kernel with a stability rating of more than '+++' doesn't suffer from pairing-instability. -We recommend to use the [`WendlandC2Kernel`](@ref) for most applications. +We recommend using the [`WendlandC2Kernel`](@ref) for most applications. If less smoothing is needed, try [`SchoenbergCubicSplineKernel`](@ref), for more smoothing try [`WendlandC6Kernel`](@ref). ```@eval - -using TrixiParticles -using CairoMakie - -# --- Group the kernels for combined plotting --- -wendland_kernels = [ - ("Wendland C2", WendlandC2Kernel{2}()), - ("Wendland C4", WendlandC4Kernel{2}()), - ("Wendland C6", WendlandC6Kernel{2}()), -] - -schoenberg_kernels = [ - ("Cubic Spline", SchoenbergCubicSplineKernel{2}()), - ("Quartic Spline", SchoenbergQuarticSplineKernel{2}()), - ("Quintic Spline", SchoenbergQuinticSplineKernel{2}()), -] - -other_kernels = [ - ("Gaussian", GaussianKernel{2}()), - ("Poly6", Poly6Kernel{2}()), - ("Laguerre-Gauss", LaguerreGaussKernel{2}()), -] - -spiky_kernel_group = [ - ("Spiky Kernel", SpikyKernel{2}()), -] - -# A list of all kernel groups to be plotted -# A boolean flag controls whether to apply the consistent y-range -kernel_groups = [ - (title="Wendland Kernels", kernels=wendland_kernels, use_consistent_range=true), - (title="Schoenberg Spline Kernels", kernels=schoenberg_kernels, use_consistent_range=true), - (title="Other Kernels", kernels=other_kernels, use_consistent_range=true), - (title="Spiky Kernel", kernels=spiky_kernel_group, use_consistent_range=false), -] - -# --- Pre-calculate global y-ranges for consistency --- -kernels_for_range_calc = vcat(wendland_kernels, schoenberg_kernels, other_kernels) - -q_range = range(0, 3, length=300) -h = 1.0 -min_val, max_val = Inf, -Inf -min_deriv, max_deriv = Inf, -Inf - -for (_, kernel_obj) in kernels_for_range_calc - kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range] +using TrixiParticles +using CairoMakie + +# --- Group the kernels for combined plotting --- +wendland_kernels = [ + ("Wendland C2", WendlandC2Kernel{2}()), + ("Wendland C4", WendlandC4Kernel{2}()), + ("Wendland C6", WendlandC6Kernel{2}()), +] + +schoenberg_kernels = [ + ("Cubic Spline", SchoenbergCubicSplineKernel{2}()), + ("Quartic Spline", SchoenbergQuarticSplineKernel{2}()), + ("Quintic Spline", SchoenbergQuinticSplineKernel{2}()), +] + +other_kernels = [ + ("Gaussian", GaussianKernel{2}()), + ("Poly6", Poly6Kernel{2}()), + ("Laguerre-Gauss", LaguerreGaussKernel{2}()), +] + +spiky_kernel_group = [ + ("Spiky Kernel", SpikyKernel{2}()), +] + +# A list of all kernel groups to be plotted +# A boolean flag controls whether to apply the consistent y-range +kernel_groups = [ + (title="Wendland Kernels", kernels=wendland_kernels, use_consistent_range=true), + (title="Schoenberg Spline Kernels", kernels=schoenberg_kernels, use_consistent_range=true), + (title="Other Kernels", kernels=other_kernels, use_consistent_range=true), + (title="Spiky Kernel", kernels=spiky_kernel_group, use_consistent_range=false), +] + +# --- Pre-calculate global y-ranges for consistency --- +kernels_for_range_calc = vcat(wendland_kernels, schoenberg_kernels, other_kernels) + +q_range = range(0, 3, length=300) +h = 1.0 +min_val, max_val = Inf, -Inf +min_deriv, max_deriv = Inf, -Inf + +for (_, kernel_obj) in kernels_for_range_calc + kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range] kernel_derivs = [TrixiParticles.kernel_deriv(kernel_obj, q, h) for q in q_range] - global min_val = min(min_val, minimum(kernel_values)) - global max_val = max(max_val, maximum(kernel_values)) - global min_deriv = min(min_deriv, minimum(kernel_derivs)) - global max_deriv = max(max_deriv, maximum(kernel_derivs)) -end + global min_val = min(min_val, minimum(kernel_values)) + global max_val = max(max_val, maximum(kernel_values)) + global min_deriv = min(min_deriv, minimum(kernel_derivs)) + global max_deriv = max(max_deriv, maximum(kernel_derivs)) +end -# Add 10% padding to the y-limits for better visuals -y_range_val = (min_val - 0.1 * (max_val - min_val), - max_val + 0.1 * (max_val - min_val)) -y_range_deriv = (min_deriv - 0.1 * (max_deriv - min_deriv), - max_deriv + 0.1 * (max_deriv - min_deriv)) +# Add 10% padding to the y-limits for better visuals +y_range_val = (min_val - 0.1 * (max_val - min_val), + max_val + 0.1 * (max_val - min_val)) +y_range_deriv = (min_deriv - 0.1 * (max_deriv - min_deriv), + max_deriv + 0.1 * (max_deriv - min_deriv)) -fig = Figure(size = (1000, 1200), fontsize=16) +fig = Figure(size = (1000, 1200), fontsize=16) -for (i, group) in enumerate(kernel_groups) - ax_val = Axis(fig[i, 1], - xlabel = "q = r/h", ylabel = "w(q)", - title = group.title) +for (i, group) in enumerate(kernel_groups) + ax_val = Axis(fig[i, 1], + xlabel = "q = r/h", ylabel = "w(q)", + title = group.title) - ax_deriv = Axis(fig[i, 2], - xlabel = "q = r/h", ylabel = "w'(q)", - title = group.title) + ax_deriv = Axis(fig[i, 2], + xlabel = "q = r/h", ylabel = "w'(q)", + title = group.title) - if group.use_consistent_range - ylims!(ax_val, y_range_val) - ylims!(ax_deriv, y_range_deriv) - end + if group.use_consistent_range + ylims!(ax_val, y_range_val) + ylims!(ax_deriv, y_range_deriv) + end - hlines!(ax_val, [0.0], linestyle = :dash) - hlines!(ax_deriv, [0.0], linestyle = :dash) + hlines!(ax_val, [0.0], linestyle = :dash) + hlines!(ax_deriv, [0.0], linestyle = :dash) - for (name, kernel_obj) in group.kernels - kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range] - kernel_derivs = [TrixiParticles.kernel_deriv(kernel_obj, q, h) for q in q_range] + for (name, kernel_obj) in group.kernels + kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range] + kernel_derivs = [TrixiParticles.kernel_deriv(kernel_obj, q, h) for q in q_range] - lines!(ax_val, q_range, kernel_values, label=name, linewidth=2.5) - lines!(ax_deriv, q_range, kernel_derivs, label=name, linewidth=2.5) - end + lines!(ax_val, q_range, kernel_values, label=name, linewidth=2.5) + lines!(ax_deriv, q_range, kernel_derivs, label=name, linewidth=2.5) + end - axislegend(ax_val, position = :rt) - axislegend(ax_deriv, position = :rt) -end + axislegend(ax_val, position = :rt) + axislegend(ax_deriv, position = :rt) +end -# Add row gaps between the 4 rows (3 gaps total) -for i in 1:(length(kernel_groups) - 1) - rowgap!(fig.layout, i, 25) -end +# Add row gaps between the 4 rows (3 gaps total) +for i in 1:(length(kernel_groups) - 1) + rowgap!(fig.layout, i, 25) +end CairoMakie.save("smoothing_kernels.png", fig) - + ``` ![Radial profiles and derivatives of the available smoothing kernels](smoothing_kernels.png) @@ -125,18 +124,24 @@ CairoMakie.save("smoothing_kernels.png", fig) !!! note "Usage" The kernel can be called as - ``` + ```jldoctest kernels; output = false, setup = :(using TrixiParticles, LinearAlgebra; smoothing_kernel = WendlandC2Kernel{2}(); h = 1.0; r = 0.5) TrixiParticles.kernel(smoothing_kernel, r, h) + + # output + 0.35250333098869013 ``` The length of the compact support can be obtained as - ``` + ```jldoctest kernels; output = false TrixiParticles.compact_support(smoothing_kernel, h) + + # output + 2.0 ``` Note that ``r`` has to be a scalar, so in the context of SPH, the kernel should be used as ```math - W(\Vert r_a - r_b \Vert, h). + W(\Vert r_a - r_b \Vert, h). ``` The gradient required in SPH, @@ -144,10 +149,61 @@ CairoMakie.save("smoothing_kernels.png", fig) \nabla_{r_a} W(\Vert r_a - r_b \Vert, h) ``` can be called as - ``` + ```jldoctest kernels; output = false, setup = :(pos_diff = SVector(0.5, 0.5); distance = norm(pos_diff)) TrixiParticles.kernel_grad(smoothing_kernel, pos_diff, distance, h) + + # output + 2-element SVector{2, Float64} with indices SOneTo(2): + -0.3762063922043116 + -0.3762063922043116 + ``` + where `pos_diff` is $r_a - r_b$ and `distance` is $\Vert r_a - r_b \Vert$, + although in most cases, it is recommended to use + ```jldoctest kernels; output = false, setup=:(trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "hydrostatic_water_column_2d.jl"), sol=nothing); system = fluid_system; particle = 1) + TrixiParticles.smoothing_kernel_grad(system, pos_diff, distance, particle) + + # output + 2-element SVector{2, Float64} with indices SOneTo(2): + 0.0 + 0.0 + ``` + instead, which will also take into account if the system is using a corrected kernel. + + The equivalent function to obtain the kernel value is + ```jldoctest kernels; output = false + TrixiParticles.smoothing_kernel(system, distance, particle) + + # output + 0.0 + ``` + + In performance-critical code, it is recommended to use the `_unsafe` versions of these + functions, which skip the compact support and zero distance checks: + ```jldoctest kernels; output = false + TrixiParticles.smoothing_kernel_unsafe(system, distance, particle) + TrixiParticles.smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) + + # output + 2-element SVector{2, Float64} with indices SOneTo(2): + -106899.65266169122 + -106899.65266169122 + ``` + These functions are only safe to use if compact support and zero distance have already + been checked. + The compact support is automatically checked when inside a `foreach_point_neighbor` + or `foreach_neighbor` loop. + For the zero distance check, use + ```jldoctest kernels; output = false, setup = :(almostzero = 1e-12) + TrixiParticles.skip_zero_distance(system) && distance < almostzero + + # output + false ``` - where `pos_diff` is $r_a - r_b$ and `distance` is $\Vert r_a - r_b \Vert$. + where `skip_zero_distance` returns `true` if the kernel gradient is zero at zero + distance, which is not the case for some corrected kernels. + Here, `almostzero` is a small threshold, which should be chosen as `sqrt(eps(h^2))`, + where `h` is the smoothing length, since `distance^2` is in the order of `h^2`. + Note that `sqrt(eps(h^2)) != eps(h)`. ```@autodocs Modules = [TrixiParticles] diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index faa1fd2956..215a2b3bf1 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -17,7 +17,6 @@ using GPUArraysCore: AbstractGPUArray using JSON: JSON using KernelAbstractions: KernelAbstractions, @kernel, @index using LinearAlgebra: norm, normalize, cross, dot, I, tr, inv, pinv, det -using MuladdMacro: @muladd using Polyester: Polyester, @batch using Printf: @printf, @sprintf using ReadVTK: ReadVTK diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 10ccb6261f..fde3b92a7b 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -130,10 +130,36 @@ end return kernel(smoothing_kernel, distance, smoothing_length(system, particle)) end +@inline function smoothing_kernel_unsafe(system, distance, particle) + (; smoothing_kernel) = system + return kernel_unsafe(smoothing_kernel, distance, smoothing_length(system, particle)) +end + +@inline function skip_zero_distance(system::AbstractSystem) + return skip_zero_distance(system_correction(system)) +end + +# Robust/safe version of the function below. In performance-critical code, manually check +# the kernel support, call `skip_zero_distance` and then `smoothing_kernel_grad_unsafe`. @inline function smoothing_kernel_grad(system, pos_diff, distance, particle) - return corrected_kernel_grad(system_smoothing_kernel(system), pos_diff, - distance, smoothing_length(system, particle), - system_correction(system), system, particle) + h = smoothing_length(system, particle) + compact_support_ = compact_support(system_smoothing_kernel(system), h) + + # Note that `sqrt(eps(h^2)) != eps(h)` + if distance >= compact_support_ || + (skip_zero_distance(system) && distance^2 < eps(h^2)) + return zero(pos_diff) + end + + return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) +end + +# Skip the zero distance and compact support checks for maximum performance. +# Only call this when you are sure that `0 < distance < compact_support`. +@inline function smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) + return corrected_kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff, + distance, smoothing_length(system, particle), + system_correction(system), system, particle) end # System updates do nothing by default, but can be dispatched if needed diff --git a/src/general/corrections.jl b/src/general/corrections.jl index 292523feb8..77f4fae0dd 100644 --- a/src/general/corrections.jl +++ b/src/general/corrections.jl @@ -204,6 +204,15 @@ function compute_correction_values!(system, neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff foreach_point_neighbor(system, neighbor_system, system_coords, neighbor_coords, semi) do particle, neighbor, pos_diff, distance @@ -217,10 +226,12 @@ function compute_correction_values!(system, kernel_correction_coefficient[particle] += volume * W - # Only consider particles with a distance > 0. See `src/general/smoothing_kernels.jl` for more details. - if distance^2 > eps(initial_smoothing_length(system)^2) - grad_W = kernel_grad(system_smoothing_kernel(system), pos_diff, distance, - smoothing_length(system, particle)) + # Only consider particles with a distance > 0. + if distance > almostzero + # Now that we know that `distance` is not zero, we can safely call the + # unsafe version of the kernel gradient to avoid redundant zero checks. + grad_W = kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff, + distance, smoothing_length(system, particle)) tmp = volume * grad_W for i in axes(dw_gamma, 1) dw_gamma[i, particle] += tmp[i] @@ -337,31 +348,36 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system, v_neighbor_system = wrap_v(v_ode, neighbor_system, semi) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + almostzero = sqrt(eps(compact_support(system, neighbor_system)^2)) foreach_point_neighbor(system, neighbor_system, coordinates, neighbor_coords, semi) do particle, neighbor, pos_diff, distance - volume = hydrodynamic_mass(neighbor_system, neighbor) / - current_density(v_neighbor_system, neighbor_system, neighbor) - smoothing_length_ = smoothing_length(system, particle) - - function compute_grad_kernel(correction, smoothing_kernel, pos_diff, distance, - smoothing_length_, system, particle) - return smoothing_kernel_grad(system, pos_diff, distance, particle) + function kernel_grad_local(correction, smoothing_kernel, pos_diff, distance, + smoothing_length_, system, particle) + return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) end # Compute gradient of corrected kernel - function compute_grad_kernel(correction::MixedKernelGradientCorrection, - smoothing_kernel, pos_diff, distance, - smoothing_length_, system, particle) - return corrected_kernel_grad(smoothing_kernel, pos_diff, distance, - smoothing_length_, KernelCorrection(), system, - particle) + function kernel_grad_local(correction::MixedKernelGradientCorrection, + smoothing_kernel, pos_diff, distance, + smoothing_length_, system, particle) + return corrected_kernel_grad_unsafe(smoothing_kernel, pos_diff, distance, + smoothing_length_, KernelCorrection(), + system, particle) end - grad_kernel = compute_grad_kernel(correction, smoothing_kernel, pos_diff, - distance, smoothing_length_, system, particle) + # Skip neighbors with the same position if the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(correction) && distance < almostzero && return - iszero(grad_kernel) && return + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + smoothing_length_ = smoothing_length(system, particle) + grad_kernel = kernel_grad_local(correction, smoothing_kernel, pos_diff, + distance, smoothing_length_, system, particle) + + volume = hydrodynamic_mass(neighbor_system, neighbor) / + current_density(v_neighbor_system, neighbor_system, neighbor) L = volume * grad_kernel * pos_diff' diff --git a/src/general/smoothing_kernels.jl b/src/general/smoothing_kernels.jl index 4a97564648..456d55a13f 100644 --- a/src/general/smoothing_kernels.jl +++ b/src/general/smoothing_kernels.jl @@ -9,44 +9,86 @@ abstract type AbstractSmoothingKernel{NDIMS} end # `distance^2` is in the order of `h^2`, hence the comparison `distance^2 < eps(h^2)`. # Note that this is faster than `distance < sqrt(eps(h^2))`. # Also note that `sqrt(eps(h^2)) != eps(h)`. - distance^2 < eps(h^2) && return zero(pos_diff) + compact_support_ = compact_support(kernel, h) + nonzero = distance < compact_support_ && distance^2 > eps(h^2) + nonzero || return zero(pos_diff) - return kernel_deriv(kernel, distance, h) / distance * pos_diff + # Now we can use `kernel_grad_unsafe` without worrying about division by zero + return kernel_grad_unsafe(kernel, pos_diff, distance, h) end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, correction, system, - particle) - return kernel_grad(kernel, pos_diff, distance, h) +# Skip the zero distance and compact support checks for maximum performance. +# Only call this when you are sure that `distance` is not zero and within the compact support. +@inline function kernel_grad_unsafe(kernel, pos_diff, distance, h) + # Call an optimized version of the kernel derivative, skipping redundant checks. + # Since this is used as `kernel_deriv / distance * pos_diff`, we already divide by `r` + # (or rather skip a multiplication by `r`) inside `kernel_deriv_div_r_unsafe` + # to save expensive divisions. + return kernel_deriv_div_r_unsafe(kernel, distance, h) * pos_diff end -@inline function corrected_kernel_grad(kernel_, pos_diff, distance, h, ::KernelCorrection, - system, particle) +@inline function kernel(kernel, r::Real, h) + # Zero out result if outside of compact support + compact_support_ = compact_support(kernel, h) + return ifelse(r < compact_support_, kernel_unsafe(kernel, r, h), zero(r)) +end + +@inline function kernel_deriv(kernel, r::Real, h) + # Zero out result if outside of compact support or if `r` is almost zero + # (to avoid division by zero in the unsafe version). + compact_support_ = compact_support(kernel, h) + if r < compact_support_ && r^2 > eps(h^2) + # The unsafe version returns the kernel derivative divided by `r`, + # so we multiply it by `r` to get the actual derivative. + return kernel_deriv_div_r_unsafe(kernel, r, h) * r + end + + return zero(r) +end + +@inline function skip_zero_distance(correction) + return true +end + +@inline function skip_zero_distance(::Union{KernelCorrection, + MixedKernelGradientCorrection}) + # For these corrections, the kernel gradient between a particle and itself is not zero + return false +end + +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, correction, + system, particle) + return kernel_grad_unsafe(kernel, pos_diff, distance, h) +end + +@inline function corrected_kernel_grad_unsafe(kernel_, pos_diff, distance, h, + ::KernelCorrection, system, particle) return (kernel_grad(kernel_, pos_diff, distance, h) .- kernel(kernel_, distance, h) * dw_gamma(system, particle)) / kernel_correction_coefficient(system, particle) end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, - corr::BlendedGradientCorrection, system, - particle) +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, + corr::BlendedGradientCorrection, system, + particle) (; blending_factor) = corr - grad = kernel_grad(kernel, pos_diff, distance, h) + grad = kernel_grad_unsafe(kernel, pos_diff, distance, h) return (1 - blending_factor) * grad + blending_factor * correction_matrix(system, particle) * grad end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, - ::GradientCorrection, system, particle) - grad = kernel_grad(kernel, pos_diff, distance, h) +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, + ::GradientCorrection, system, particle) + grad = kernel_grad_unsafe(kernel, pos_diff, distance, h) return correction_matrix(system, particle) * grad end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, - ::MixedKernelGradientCorrection, system, - particle) - grad = corrected_kernel_grad(kernel, pos_diff, distance, h, KernelCorrection(), - system, particle) +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, + ::MixedKernelGradientCorrection, system, + particle) + grad = corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, KernelCorrection(), + system, particle) return correction_matrix(system, particle) * grad end @@ -80,32 +122,40 @@ which is beneficial in regards to stability but makes it less accurate. """ struct GaussianKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@inline @fastmath function kernel(kernel::GaussianKernel, r::Real, h) - q = r / h +@inline function kernel_unsafe(kernel::GaussianKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) - # Zero out result if q >= 3 - result = ifelse(q < 3, normalization_factor(kernel, h) * exp(-q^2), zero(q)) - - return result + return normalization_factor(kernel, h_inv) * exp_q2 end -@inline @fastmath function kernel_deriv(kernel::GaussianKernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv - - # Zero out result if q >= 3 - result = ifelse(q < 3, - -2 * q * normalization_factor(kernel, h) * exp(-q^2) * inner_deriv, - zero(q)) +@inline function kernel_deriv_div_r_unsafe(kernel::GaussianKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) - return result + return -2 * normalization_factor(kernel, h_inv) * exp_q2 * h_inv^2 end @inline compact_support(::GaussianKernel, h) = 3 * h -@inline normalization_factor(::GaussianKernel{2}, h) = 1 / (pi * h^2) -# First convert `pi` to the type of `h` to preserve the type of `h` -@inline normalization_factor(::GaussianKernel{3}, h) = 1 / (oftype(h, pi)^(3 // 2) * h^3) +@inline function normalization_factor(::GaussianKernel{2}, h_inv) + return oftype(h_inv, 1 / pi) * h_inv^2 +end + +@inline function normalization_factor(::GaussianKernel{3}, h_inv) + # 1 / (pi^1.5 * h^3) + return oftype(h_inv, 0.17958712212516656) * h_inv^3 +end @doc raw""" SchoenbergCubicSplineKernel{NDIMS}() @@ -140,40 +190,41 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct SchoenbergCubicSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@muladd @inline function kernel(kernel::SchoenbergCubicSplineKernel, r::Real, h) - q = r / h - - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl. - result = 1 * (2 - q)^3 / 4 - result = result - (q < 1) * (1 - q)^3 +@inline function kernel_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(result)) + result = 1 * (2 - q)^3 / 4 - (q < 1) * (1 - q)^3 - return result + return normalization_factor(kernel, h_inv) * result end -@muladd @inline function kernel_deriv(kernel::SchoenbergCubicSplineKernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = -3 * (2 - q)^2 / 4 - result = result + 3 * (q < 1) * (1 - q)^2 + result = -3 * (2 - q)^2 / 4 + 3 * (q < 1) * (1 - q)^2 + kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result * inner_deriv, - zero(result)) - - return result + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + # See the docs page "Development" for more details on `div_fast`. + return div_fast(kernel_deriv, r) end @inline compact_support(::SchoenbergCubicSplineKernel, h) = 2 * h -# Note that `2 // 3 / h` saves one instruction but is significantly slower on GPUs (for now) -@inline normalization_factor(::SchoenbergCubicSplineKernel{1}, h) = 2 / (3 * h) -@inline normalization_factor(::SchoenbergCubicSplineKernel{2}, h) = 10 / (pi * h^2 * 7) -@inline normalization_factor(::SchoenbergCubicSplineKernel{3}, h) = 1 / (pi * h^3) +@inline function normalization_factor(::SchoenbergCubicSplineKernel{1}, h_inv) + return oftype(h_inv, 2 / 3) * h_inv +end + +@inline function normalization_factor(::SchoenbergCubicSplineKernel{2}, h_inv) + return oftype(h_inv, 10 / (pi * 7)) * h_inv^2 +end + +@inline function normalization_factor(::SchoenbergCubicSplineKernel{3}, h_inv) + return oftype(h_inv, 1 / pi) * h_inv^3 +end @doc raw""" SchoenbergQuarticSplineKernel{NDIMS}() @@ -217,54 +268,58 @@ struct SchoenbergQuarticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} en # to a power of three, see # https://github.com/JuliaLang/julia/blob/34934736fa4dcb30697ac1b23d11d5ad394d6a4d/base/intfuncs.jl#L327-L339 # By using the `@fastpow` macro, we are consciously trading off some precision in the result -# for enhanced computational speed. This is especially useful in scenarios where performance -# is a higher priority than exact precision. -@fastpow @muladd @inline function kernel(kernel::SchoenbergQuarticSplineKernel, r::Real, h) - q = r / h +# for enhanced computational speed by using simple multiplications for higher powers. +@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuarticSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Preserve the type of `q` - q5_2 = (5 // 2 - q) - q3_2 = (3 // 2 - q) - q1_2 = (1 // 2 - q) + # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason + q5_2 = 2.5f0 - q + q3_2 = 1.5f0 - q + q1_2 = 0.5f0 - q - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = q5_2^4 - result = result - 5 * (q < 3 // 2) * q3_2^4 - result = result + 10 * (q < 1 // 2) * q1_2^4 + # Note: `q < 3 // 2` is not working on GPUs. + # See https://github.com/JuliaGPU/CUDA.jl/issues/2681. + result = q5_2^4 - 5 * (q < 1.5f0) * q3_2^4 + 10 * (q < 0.5f0) * q1_2^4 - # Zero out result if q >= 5/2 - result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result, zero(result)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuarticSplineKernel, - r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuarticSplineKernel, + r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Preserve the type of `q` - q5_2 = 5 // 2 - q - q3_2 = 3 // 2 - q - q1_2 = 1 // 2 - q + # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason + q5_2 = 2.5f0 - q + q3_2 = 1.5f0 - q + q1_2 = 0.5f0 - q - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = -4 * q5_2^3 - result = result + 20 * (q < 3 // 2) * q3_2^3 - result = result - 40 * (q < 1 // 2) * q1_2^3 + # Note: `q < 3 // 2` is not working on GPUs. + # See https://github.com/JuliaGPU/CUDA.jl/issues/2681. + result = -4 * q5_2^3 + 20 * (q < 1.5f0) * q3_2^3 - 40 * (q < 0.5f0) * q1_2^3 - # Zero out result if q >= 5/2 - result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result * inner_deriv, - zero(result)) + kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv - return result + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + # See the docs page "Development" for more details on `div_fast`. + return div_fast(kernel_deriv, r) end -@inline compact_support(::SchoenbergQuarticSplineKernel, h) = 5 * h / 2 +@inline compact_support(::SchoenbergQuarticSplineKernel, h) = oftype(h, 5 / 2) * h -@inline normalization_factor(::SchoenbergQuarticSplineKernel{1}, h) = 1 / (24 * h) -@inline normalization_factor(::SchoenbergQuarticSplineKernel{2}, h) = 96 / (pi * h^2 * 1199) -@inline normalization_factor(::SchoenbergQuarticSplineKernel{3}, h) = 1 / (pi * h^3 * 20) +@inline function normalization_factor(::SchoenbergQuarticSplineKernel{1}, h_inv) + return oftype(h_inv, 1 / 24) * h_inv +end + +@inline function normalization_factor(::SchoenbergQuarticSplineKernel{2}, h_inv) + return oftype(h_inv, 96 / (pi * 1199)) * h_inv^2 +end + +@inline function normalization_factor(::SchoenbergQuarticSplineKernel{3}, h_inv) + return oftype(h_inv, 1 / (pi * 20)) * h_inv^3 +end @doc raw""" SchoenbergQuinticSplineKernel{NDIMS}() @@ -301,48 +356,43 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct SchoenbergQuinticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@fastpow @muladd @inline function kernel(kernel::SchoenbergQuinticSplineKernel, r::Real, h) - q = r / h - q3 = (3 - q) - q2 = (2 - q) - q1 = (1 - q) +@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuinticSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = q3^5 - result = result - 6 * (q < 2) * q2^5 - result = result + 15 * (q < 1) * q1^5 + result = (3 - q)^5 - 6 * (q < 2) * (2 - q)^5 + 15 * (q < 1) * (1 - q)^5 - # Zero out result if q >= 3 - result = ifelse(q < 3, normalization_factor(kernel, h) * result, zero(result)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuinticSplineKernel, - r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv - q3 = (3 - q) - q2 = (2 - q) - q1 = (1 - q) +@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuinticSplineKernel, + r::Real, h) + h_inv = 1 / h + q = r * h_inv - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = -5 * q3^4 - result = result + 30 * (q < 2) * q2^4 - result = result - 75 * (q < 1) * q1^4 + result = -5 * (3 - q)^4 + 30 * (q < 2) * (2 - q)^4 - 75 * (q < 1) * (1 - q)^4 - # Zero out result if q >= 3 - result = ifelse(q < 3, normalization_factor(kernel, h) * result * inner_deriv, - zero(result)) + kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv - return result + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + # See the docs page "Development" for more details on `div_fast`. + return div_fast(kernel_deriv, r) end @inline compact_support(::SchoenbergQuinticSplineKernel, h) = 3 * h -@inline normalization_factor(::SchoenbergQuinticSplineKernel{1}, h) = 1 / (120 * h) -@inline normalization_factor(::SchoenbergQuinticSplineKernel{2}, h) = 7 / (pi * h^2 * 478) -@inline normalization_factor(::SchoenbergQuinticSplineKernel{3}, h) = 1 / (pi * h^3 * 120) +@inline function normalization_factor(::SchoenbergQuinticSplineKernel{1}, h_inv) + return oftype(h_inv, 1 / (120)) * h_inv +end + +@inline function normalization_factor(::SchoenbergQuinticSplineKernel{2}, h_inv) + return oftype(h_inv, 7 / (pi * 478)) * h_inv^2 +end + +@inline function normalization_factor(::SchoenbergQuinticSplineKernel{3}, h_inv) + return oftype(h_inv, 1 / (pi * 120)) * h_inv^3 +end abstract type AbstractWendlandKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end @@ -383,33 +433,26 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct WendlandC2Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end -@fastpow @inline function kernel(kernel::WendlandC2Kernel, r::Real, h) - q = r / h - - result = (1 - q / 2)^4 * (2 * q + 1) +@fastpow @inline function kernel_unsafe(kernel::WendlandC2Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * (1 - q / 2)^4 * (2 * q + 1) end -@inline function kernel_deriv(kernel::WendlandC2Kernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv - - result = -5 * (1 - q / 2)^3 * q +@inline function kernel_deriv_div_r_unsafe(kernel::WendlandC2Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, - normalization_factor(kernel, h) * result * inner_deriv, zero(q)) - - return result + return -5 * normalization_factor(kernel, h_inv) * (1 - q / 2)^3 * h_inv^2 end -# Note that `7 // 4` saves one instruction but is significantly slower on GPUs (for now) -@inline normalization_factor(::WendlandC2Kernel{2}, h) = 7 / (pi * h^2 * 4) -@inline normalization_factor(::WendlandC2Kernel{3}, h) = 21 / (pi * h^3 * 16) +@inline function normalization_factor(::WendlandC2Kernel{2}, h_inv) + return oftype(h_inv, 7 / (4 * pi)) * h_inv^2 +end +@inline function normalization_factor(::WendlandC2Kernel{3}, h_inv) + return oftype(h_inv, 21 / (16 * pi)) * h_inv^3 +end @doc raw""" WendlandC4Kernel{NDIMS}() @@ -445,31 +488,30 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct WendlandC4Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end -@fastpow @inline function kernel(kernel::WendlandC4Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_unsafe(kernel::WendlandC4Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv result = (1 - q / 2)^6 * (35 * q^2 / 12 + 3 * q + 1) - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @inline function kernel_deriv(kernel::WendlandC4Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC4Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - derivative = -7 * q / 3 * (2 + 5 * q) * (1 - q / 2)^5 + result = oftype(r, -7 / 3) * (2 + 5 * q) * (1 - q / 2)^5 * h_inv^2 - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h, - zero(derivative)) - - return result + return normalization_factor(kernel, h_inv) * result end -@inline normalization_factor(::WendlandC4Kernel{2}, h) = 9 / (pi * h^2 * 4) -@inline normalization_factor(::WendlandC4Kernel{3}, h) = 495 / (pi * h^3 * 256) +@inline function normalization_factor(::WendlandC4Kernel{2}, h_inv) + return oftype(h_inv, 9 / (pi * 4)) * h_inv^2 +end +@inline function normalization_factor(::WendlandC4Kernel{3}, h_inv) + return oftype(h_inv, 495 / (pi * 256)) * h_inv^3 +end @doc raw""" WendlandC6Kernel{NDIMS}() @@ -505,31 +547,31 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct WendlandC6Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end -@fastpow @inline function kernel(kernel::WendlandC6Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_unsafe(kernel::WendlandC6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv result = (1 - q / 2)^8 * (4 * q^3 + 25 * q^2 / 4 + 4 * q + 1) - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @muladd @inline function kernel_deriv(kernel::WendlandC6Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - derivative = -11 * q / 4 * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7 + result = oftype(r, -11 / 4) * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7 * h_inv^2 - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h, - zero(derivative)) + return normalization_factor(kernel, h_inv) * result +end - return result +@inline function normalization_factor(::WendlandC6Kernel{2}, h_inv) + return oftype(h_inv, 39 / (pi * 14)) * h_inv^2 end -@inline normalization_factor(::WendlandC6Kernel{2}, h) = 39 / (pi * h^2 * 14) -@inline normalization_factor(::WendlandC6Kernel{3}, h) = 1365 / (pi * h^3 * 512) +@inline function normalization_factor(::WendlandC6Kernel{3}, h_inv) + return oftype(h_inv, 1365 / (pi * 512)) * h_inv^3 +end @doc raw""" Poly6Kernel{NDIMS}() @@ -541,7 +583,6 @@ especially in computer graphics contexts. It is defined as W(r, h) = \frac{1}{h^d} w(r/h) ``` - with ```math @@ -569,34 +610,34 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct Poly6Kernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@inline function kernel(kernel::Poly6Kernel, r::Real, h) - q = r / h - - result = (1 - q^2)^3 +@inline function kernel_unsafe(kernel::Poly6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 1 - result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * (1 - q^2)^3 end -@inline function kernel_deriv(kernel::Poly6Kernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::Poly6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - result = -6 * q * (1 - q^2)^2 + kernel_deriv = normalization_factor(kernel, h_inv) * -6 * q * (1 - q^2)^2 * h_inv - # Zero out result if q >= 1 - result = ifelse(q < 1, - result * normalization_factor(kernel, h) * inner_deriv, zero(q)) - return result + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + # See the docs page "Development" for more details on `div_fast`. + return div_fast(kernel_deriv, r) end @inline compact_support(::Poly6Kernel, h) = h -# Note that `315 // 64` saves one instruction but is significantly slower on GPUs (for now) -@inline normalization_factor(::Poly6Kernel{2}, h) = 4 / (pi * h^2) -@inline normalization_factor(::Poly6Kernel{3}, h) = 315 / (pi * h^3 * 64) +@inline function normalization_factor(::Poly6Kernel{2}, h_inv) + return oftype(h_inv, 4 / (pi)) * h_inv^2 +end + +@inline function normalization_factor(::Poly6Kernel{3}, h_inv) + return oftype(h_inv, 315 / (64 * pi)) * h_inv^3 +end @doc raw""" SpikyKernel{NDIMS}() @@ -634,38 +675,39 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct SpikyKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@inline function kernel(kernel::SpikyKernel, r::Real, h) - q = r / h - - result = (1 - q)^3 - - # Zero out result if q >= 1 - result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q)) +@inline function kernel_unsafe(kernel::SpikyKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - return result + return normalization_factor(kernel, h_inv) * (1 - q)^3 end -@inline function kernel_deriv(kernel::SpikyKernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::SpikyKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - result = -3 * (1 - q)^2 + kernel_deriv = -3 * normalization_factor(kernel, h_inv) * (1 - q)^2 * h_inv - # Zero out result if q >= 1 - result = ifelse(q < 1, result * normalization_factor(kernel, h) * inner_deriv, zero(q)) - - return result + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + # See the docs page "Development" for more details on `div_fast`. + return div_fast(kernel_deriv, r) end @inline compact_support(::SpikyKernel, h) = h -@inline normalization_factor(::SpikyKernel{2}, h) = 10 / (pi * h^2) -@inline normalization_factor(::SpikyKernel{3}, h) = 15 / (pi * h^3) +@inline function normalization_factor(::SpikyKernel{2}, h_inv) + return oftype(h_inv, 10 / pi) * h_inv^2 +end + +@inline function normalization_factor(::SpikyKernel{3}, h_inv) + return oftype(h_inv, 15 / pi) * h_inv^3 +end @doc raw""" LaguerreGaussKernel{NDIMS}() -Truncated Laguerre–Gauss kernel (fourth-order smoothing) as defined in +Truncated Laguerre-Gauss kernel (fourth-order smoothing) as defined in [Wang2024](@cite). Its radial form uses ``q = r/h`` and is truncated at ``q = 2`` (compact support ``2h``): @@ -716,51 +758,65 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct LaguerreGaussKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@fastpow @inline function kernel(kernel::LaguerreGaussKernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_unsafe(kernel::LaguerreGaussKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # polynomial part: 1 - s^2/2 + s^4/6 + # Polynomial part poly = 1 - q^2 / 2 + q^4 / 6 - # zero out for s ≥ 2 - return ifelse(q < 2, normalization_factor(kernel, h) * poly * exp(-q^2), zero(q)) + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) + + return normalization_factor(kernel, h_inv) * poly * exp_q2 end -@inline function kernel_deriv(kernel::LaguerreGaussKernel, r::Real, h) - invh = 1 / h - q = r * invh +@inline function kernel_deriv_div_r_unsafe(kernel::LaguerreGaussKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv + + # dg/dq = (q / 3) * (-q^4 + 5q^2 - 9) * exp(-q^2), but we already divide by `r`` + poly = (h_inv / 3) * ((-q^2 + 5) * q^2 - 9) - # dg/dq = (q/3)*(-q^4 + 5q^2 - 9) * exp(-q^2) - poly = ((-q^2 + 5) * q^2 - 9) * (q / 3) + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) - return ifelse(q < 2, normalization_factor(kernel, h) * exp(-q^2) * poly * invh, zero(q)) + return normalization_factor(kernel, h_inv) * poly * exp_q2 * h_inv end @inline compact_support(::LaguerreGaussKernel, h) = 2 * h -# Original normalization factors as in Wang2024 -# @inline normalization_factor(::LaguerreGaussKernel{1}, h) = (8 / (5 * sqrt(pi))) / h -# @inline normalization_factor(::LaguerreGaussKernel{2}, h) = (3 / (pi)) / (h^2) -# @inline normalization_factor(::LaguerreGaussKernel{3}, h) = (8 / (pi^(3 // 2))) / (h^3) + +# Original normalization factors as in Wang2024: +# 1D: (8 / (5 * sqrt(pi))) / h +# 2D: (3 / (pi)) / (h^2) +# 3D: (8 / (pi^(3 / 2))) / (h^3) # Renormalized to the truncated integral over [0,2h] -@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h) - # C = 1/(2*(7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4)) - C = oftype(h, 0.65428780253539) - # C' = C/h - return C / h +@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h_inv) + # C = 1 / (2 * ((7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4))) + C = oftype(h_inv, 0.65428780253539) + # C' = C / h + return C * h_inv end -@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h) +@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h_inv) # C = 2 * pi * (5 - 17 * exp(-4)) / 12 - C = oftype(h, 2.454963094351984) + C_inv = oftype(h_inv, 0.4073380990128333) # C' = 1 / (h^2 * C) - return 1 / (h^2 * C) + return C_inv * h_inv^2 end -@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h) +@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h_inv) # C = (7 * sqrt(pi) / 32) * erf(2) - (77 / 24) * exp(-4) - C = oftype(h, 0.3271479336905373) + # C_ = 1 / (4 * pi * C) + C_ = oftype(h_inv, 0.24324613836999895) - # 4*pi cannot be pulled into C otherwise the test fails because of differences in rounding - return 1 / (C * 4 * pi * h^3) + # C' = 1 / (4 * pi * h^3 * C) = C_ / (h^3) + return C_ * h_inv^3 end diff --git a/src/preprocessing/particle_packing/rhs.jl b/src/preprocessing/particle_packing/rhs.jl index 77cf460f5b..807fba072f 100644 --- a/src/preprocessing/particle_packing/rhs.jl +++ b/src/preprocessing/particle_packing/rhs.jl @@ -5,11 +5,26 @@ function interact!(dv, v_particle_system, u_particle_system, system_coords = current_coordinates(u_particle_system, system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff foreach_point_neighbor(system, neighbor_system, system_coords, neighbor_coords, semi) do particle, neighbor, pos_diff, distance - # Only consider particles with a distance > 0. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(system)^2) && return + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(system, pos_diff, + distance, particle) rho_a = system.initial_condition.density[particle] rho_b = neighbor_system.initial_condition.density[neighbor] @@ -22,8 +37,6 @@ function interact!(dv, v_particle_system, u_particle_system, p_b = system.background_pressure - grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle) - # This vanishes for uniform particle distributions dv_repulsive_pressure = -(2 / m_a) * V_a * V_b * p_b * grad_kernel diff --git a/src/schemes/boundary/open_boundary/rhs.jl b/src/schemes/boundary/open_boundary/rhs.jl index 85fcc7dfb8..971f3c8a83 100644 --- a/src/schemes/boundary/open_boundary/rhs.jl +++ b/src/schemes/boundary/open_boundary/rhs.jl @@ -10,6 +10,15 @@ function interact!(dv, v_particle_system, u_particle_system, system_coords = current_coordinates(u_particle_system, particle_system) neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(particle_system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_system_coords, semi; @@ -17,14 +26,21 @@ function interact!(dv, v_particle_system, u_particle_system, neighbor, pos_diff, distance + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(particle_system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. rho_a = @inbounds current_density(v_particle_system, particle_system, particle) rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) - grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) - m_a = @inbounds hydrodynamic_mass(particle_system, particle) m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) diff --git a/src/schemes/boundary/wall_boundary/rhs.jl b/src/schemes/boundary/wall_boundary/rhs.jl index e0553764ef..5d9aef7235 100644 --- a/src/schemes/boundary/wall_boundary/rhs.jl +++ b/src/schemes/boundary/wall_boundary/rhs.jl @@ -19,9 +19,27 @@ function interact!(dv, v_particle_system, u_particle_system, system_coords = current_coordinates(u_particle_system, particle_system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(particle_system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff. foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, semi) do particle, neighbor, pos_diff, distance + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(particle_system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + m_b = hydrodynamic_mass(neighbor_system, neighbor) rho_a = current_density(v_particle_system, particle_system, particle) @@ -30,8 +48,6 @@ function interact!(dv, v_particle_system, u_particle_system, v_diff = current_velocity(v_particle_system, particle_system, particle) - current_velocity(v_neighbor_system, neighbor_system, neighbor) - grad_kernel = smoothing_kernel_grad(boundary_model, pos_diff, distance, particle) - continuity_equation!(dv, density_calculator(neighbor_system), m_b, rho_a, rho_b, v_diff, grad_kernel, particle) end diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl index b91279702b..a6c98a5d64 100644 --- a/src/schemes/fluid/entropically_damped_sph/rhs.jl +++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl @@ -11,6 +11,15 @@ function interact!(dv, v_particle_system, u_particle_system, surface_tension_a = surface_tension_model(particle_system) surface_tension_b = surface_tension_model(neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(particle_system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, semi; @@ -18,6 +27,15 @@ function interact!(dv, v_particle_system, u_particle_system, neighbor, pos_diff, distance + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(particle_system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. @@ -38,8 +56,6 @@ function interact!(dv, v_particle_system, u_particle_system, m_a = @inbounds hydrodynamic_mass(particle_system, particle) m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) - grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) - dv_pressure = pressure_acceleration(particle_system, neighbor_system, particle, neighbor, m_a, m_b, p_a - p_avg, p_b - p_avg, rho_a, diff --git a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl index ca7c4b2c35..8cebf8d20c 100644 --- a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl +++ b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl @@ -10,6 +10,15 @@ function interact!(dv, v_particle_system, u_particle_system, system_coords = current_coordinates(u_particle_system, particle_system) neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(particle_system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff. foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_system_coords, semi; @@ -17,14 +26,21 @@ function interact!(dv, v_particle_system, u_particle_system, neighbor, pos_diff, distance + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(particle_system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. rho_a = @inbounds current_density(v_particle_system, particle_system, particle) rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) - grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) - m_a = @inbounds hydrodynamic_mass(particle_system, particle) m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) diff --git a/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl b/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl index 9837eb1c69..be609e5576 100644 --- a/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl +++ b/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl @@ -221,8 +221,11 @@ end OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang}}, grad_kernel) # Density diffusion terms are all zero for distance zero. - # See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return + # If `skip_zero_distance` is `true`, we can assume that this function isn't called + # for distance zero because these neighbors have already been skipped. + if !skip_zero_distance(particle_system) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return + end # Since this is one of the most performance critical functions, using fast divisions # here gives a significant speedup on GPUs. diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index f70891eae3..96915d8b14 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -19,6 +19,15 @@ function interact!(dv, v_particle_system, u_particle_system, # the following code and the two other lines below that are marked as "debug example". # debug_array = zeros(ndims(particle_system), nparticles(particle_system)) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(particle_system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_system_coords, semi; @@ -26,6 +35,15 @@ function interact!(dv, v_particle_system, u_particle_system, neighbor, pos_diff, distance + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(particle_system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. @@ -39,8 +57,6 @@ function interact!(dv, v_particle_system, u_particle_system, surface_tension_correction) = free_surface_correction(correction, particle_system, rho_mean) - grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) - m_a = @inbounds hydrodynamic_mass(particle_system, particle) m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) diff --git a/src/schemes/structure/discrete_element_method/rhs.jl b/src/schemes/structure/discrete_element_method/rhs.jl index 0fd327b10c..ee16f2a63d 100644 --- a/src/schemes/structure/discrete_element_method/rhs.jl +++ b/src/schemes/structure/discrete_element_method/rhs.jl @@ -6,14 +6,22 @@ function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, system_coords = current_coordinates(u_particle_system, particle_system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the radius `r`, `distance^2` is in + # the order of `r^2`, so we need to check `distance < sqrt(eps(r^2))`. + # Note that `sqrt(eps(r^2)) != eps(r)`. + r = maximum(particle_system.radius) + almostzero = sqrt(eps(r^2)) + foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, semi; points=each_integrated_particle(particle_system)) do particle, neighbor, pos_diff, distance - # See `src/general/smoothing_kernels.jl` for more details - distance^2 < eps(first(particle_system.radius)^2) && return + distance < almostzero && return # Retrieve particle properties m_a = particle_system.mass[particle] diff --git a/src/schemes/structure/structure.jl b/src/schemes/structure/structure.jl index 5fe6d08f1d..ff872c04fc 100644 --- a/src/schemes/structure/structure.jl +++ b/src/schemes/structure/structure.jl @@ -17,10 +17,8 @@ end end end -function interact_structure_fluid!(dv, v_particle_system, - u_particle_system, - v_neighbor_system, - u_neighbor_system, +function interact_structure_fluid!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, particle_system, neighbor_system::AbstractFluidSystem, semi) @@ -28,18 +26,32 @@ function interact_structure_fluid!(dv, v_particle_system, system_coords = current_coordinates(u_particle_system, particle_system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(neighbor_system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff. - foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, - semi; + foreach_point_neighbor(particle_system, neighbor_system, + system_coords, neighbor_coords, semi; points=each_integrated_particle(particle_system)) do particle, neighbor, pos_diff, distance - # Only consider particles with a distance > 0. - # See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return - - grad_kernel = smoothing_kernel_grad(neighbor_system, pos_diff, distance, neighbor) + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(neighbor_system) && distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + # Note that we use the `neighbor_system` to compute the kernel gradient + # to obtain the same force as in the fluid-structure interaction. + grad_kernel = smoothing_kernel_grad_unsafe(neighbor_system, pos_diff, + distance, neighbor) m_b = hydrodynamic_mass(neighbor_system, neighbor) diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index a0fb522b4d..3383929c28 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -20,22 +20,33 @@ end # Everything here is done in the initial coordinates system_coords = initial_coordinates(system) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff. # For structure-structure interaction, this has to happen in the initial coordinates. foreach_point_neighbor(system, system, system_coords, system_coords, semi; points=each_integrated_particle(system)) do particle, neighbor, initial_pos_diff, initial_distance - # Only consider particles with a distance > 0. - # See `src/general/smoothing_kernels.jl` for more details. - initial_distance^2 < eps(initial_smoothing_length(system)^2) && return + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(system) && initial_distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff, + initial_distance, particle) rho_a = @inbounds system.material_density[particle] rho_b = @inbounds system.material_density[neighbor] - grad_kernel = smoothing_kernel_grad(system, initial_pos_diff, - initial_distance, particle) - m_a = @inbounds system.mass[particle] m_b = @inbounds system.mass[neighbor] diff --git a/src/schemes/structure/total_lagrangian_sph/system.jl b/src/schemes/structure/total_lagrangian_sph/system.jl index cac4ed1f12..b4fb02f31a 100644 --- a/src/schemes/structure/total_lagrangian_sph/system.jl +++ b/src/schemes/structure/total_lagrangian_sph/system.jl @@ -475,13 +475,27 @@ end # Reset deformation gradient set_zero!(deformation_grad) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient + # and the density diffusion divide by zero. + # To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in + # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`. + # Note that `sqrt(eps(h^2)) != eps(h)`. + h = initial_smoothing_length(system) + almostzero = sqrt(eps(h^2)) + # Loop over all pairs of particles and neighbors within the kernel cutoff initial_coords = initial_coordinates(system) foreach_point_neighbor(system, system, initial_coords, initial_coords, semi) do particle, neighbor, initial_pos_diff, initial_distance - # Only consider particles with a distance > 0. - # See `src/general/smoothing_kernels.jl` for more details. - initial_distance^2 < eps(initial_smoothing_length(system)^2) && return + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(system) && initial_distance < almostzero && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff, + initial_distance, particle) volume = @inbounds mass[neighbor] / material_density[neighbor] pos_diff_ = @inbounds current_coords(system, particle) - @@ -489,9 +503,6 @@ end # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference pos_diff = convert.(eltype(system), pos_diff_) - grad_kernel = smoothing_kernel_grad(system, initial_pos_diff, - initial_distance, particle) - # Multiply by L_{0a} L = @inbounds correction_matrix(system, particle) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 3eae7b839b..466297b355 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -331,10 +331,8 @@ end fluid_density_calculator=SummationDensity(), maxiters=38, # 38 time steps on CPU clip_negative_pressure=true), - # Broken due to https://github.com/JuliaGPU/CUDA.jl/issues/2681 - # and https://github.com/JuliaGPU/Metal.jl/issues/550. - # "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1, - # smoothing_kernel=SchoenbergQuarticSplineKernel{2}()), + "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1, + smoothing_kernel=SchoenbergQuarticSplineKernel{2}()), "WCSPH with SchoenbergQuinticSplineKernel" => (smoothing_length=1.1, smoothing_kernel=SchoenbergQuinticSplineKernel{2}()), "WCSPH with WendlandC2Kernel" => (smoothing_length=1.5, diff --git a/test/general/smoothing_kernels.jl b/test/general/smoothing_kernels.jl index fdd7f9a142..e112d9562b 100644 --- a/test/general/smoothing_kernels.jl +++ b/test/general/smoothing_kernels.jl @@ -1,19 +1,29 @@ @testset verbose=true "Smoothing Kernels" begin # Don't show all kernel tests in the final overview @testset verbose=false "Integral" begin - # All smoothing kernels should integrate to something close to 1 + # All smoothing kernels should integrate to something close to 1. + # We integrate slightly beyond the compact support to verify that the kernel is + # correctly evaluating to zero there. function integrate_kernel_2d(smk) integral_2d_radial, _ = quadgk(r -> r * TrixiParticles.kernel(smk, r, 1.0), 0, - TrixiParticles.compact_support(smk, 1.0), + TrixiParticles.compact_support(smk, 1.0) * 1.1, rtol=1e-15) return 2 * pi * integral_2d_radial end + function integrate_kernel_1d(smk) + integral_1d_half, + _ = quadgk(r -> TrixiParticles.kernel(smk, r, 1.0), 0, + TrixiParticles.compact_support(smk, 1.0) * 1.1, + rtol=1e-15) + return 2 * integral_1d_half + end + function integrate_kernel_3d(smk) integral_3d_radial, _ = quadgk(r -> r^2 * TrixiParticles.kernel(smk, r, 1.0), 0, - TrixiParticles.compact_support(smk, 1.0), + TrixiParticles.compact_support(smk, 1.0) * 1.1, rtol=1e-15) return 4 * pi * integral_3d_radial end @@ -41,6 +51,13 @@ LaguerreGaussKernel ] + kernels_1d = [ + SchoenbergCubicSplineKernel, + SchoenbergQuarticSplineKernel, + SchoenbergQuinticSplineKernel, + LaguerreGaussKernel + ] + @testset "$kernel" for kernel in kernels # The integral should be 1 for all kernels error_2d = abs(integrate_kernel_2d(kernel{2}()) - 1.0) @@ -48,6 +65,11 @@ @test error_2d <= 1e-15 @test error_3d <= 1e-15 + + if kernel in kernels_1d + error_1d = abs(integrate_kernel_1d(kernel{1}()) - 1.0) + @test error_1d <= 1e-15 + end end end @@ -67,16 +89,31 @@ LaguerreGaussKernel ] + kernels_1d = [ + SchoenbergCubicSplineKernel, + SchoenbergQuarticSplineKernel, + SchoenbergQuinticSplineKernel, + LaguerreGaussKernel + ] + # Test 4 different smoothing lengths smoothing_lengths = 0.25:0.25:1 - @testset "$kernel_type" for kernel_type in kernels - for ndims in 2:3 + for ndims in 1:3 + kernels_ndims = ndims == 1 ? kernels_1d : kernels + + @testset "$kernel_type{$ndims}" for kernel_type in kernels_ndims kernel_ = kernel_type{ndims}() for h in smoothing_lengths + compact_support_ = TrixiParticles.compact_support(kernel_, h) # Test 11 different radii - radii = 0:(0.1h):(h + eps()) + radii = 0:(0.1 * compact_support_):(compact_support_ * 1.01) + + if kernel_ isa SpikyKernel + # The Spiky kernel is not differentiable at r=0 + radii = radii[2:end] + end for r in radii # Automatic differentation of `kernel` @@ -111,11 +148,20 @@ LaguerreGaussKernel ] + kernels_1d = [ + SchoenbergCubicSplineKernel, + SchoenbergQuarticSplineKernel, + SchoenbergQuinticSplineKernel, + LaguerreGaussKernel + ] + # Test different smoothing length types smoothing_lengths = (0.5, 0.5f0) - @testset "$kernel_type" for kernel_type in kernels - for ndims in 2:3 + for ndims in 1:3 + kernels_ndims = ndims == 1 ? kernels_1d : kernels + + @testset "$kernel_type{$ndims}" for kernel_type in kernels_ndims kernel_ = kernel_type{ndims}() for h in smoothing_lengths diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl index 69f4a920c4..2b145859df 100644 --- a/test/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl @@ -93,10 +93,11 @@ TrixiParticles.each_integrated_particle(::MockSystem) = each_integrated_particle TrixiParticles.smoothing_length(::MockSystem, _) = eps() - function TrixiParticles.add_acceleration!(_, _, ::MockSystem) - return nothing + function TrixiParticles.smoothing_kernel_grad_unsafe(::MockSystem, + pos_diff, distance, + particle) + return kernel_deriv * pos_diff / distance end - TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) = kernel_deriv TrixiParticles.compact_support(::MockSystem, ::MockSystem) = 1000.0 function TrixiParticles.current_coords(system::MockSystem, particle) return TrixiParticles.current_coords(initial_coordinates, system, particle) diff --git a/test/systems/tlsph_system.jl b/test/systems/tlsph_system.jl index 1e7b1edff5..3bc3ea2ceb 100644 --- a/test/systems/tlsph_system.jl +++ b/test/systems/tlsph_system.jl @@ -184,10 +184,11 @@ Base.getindex(::Val{:mock_material_density}, ::Int64) = density - function TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) - return kernel_derivative + function TrixiParticles.smoothing_kernel_grad_unsafe(::Val{:mock_system_tensor}, + pos_diff, distance, + particle) + return kernel_derivative * pos_diff / distance end - Base.eps(::Type{Val{:mock_smoothing_length}}) = eps() semi = DummySemidiscretization() # Compute deformation gradient