diff --git a/Project.toml b/Project.toml
index 4b1b864090..85abbabaca 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TrixiParticles"
 uuid = "66699cd8-9c01-4e9d-a059-b96c86d16b3a"
-authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
 version = "0.4.5-dev"
+authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
 
 [deps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
@@ -18,7 +18,6 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221"
 PointNeighbors = "1c4d5385-0a27-49de-8e2c-43b175c8985c"
 Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -57,7 +56,6 @@ ForwardDiff = "1"
 GPUArraysCore = "0.2"
 JSON = "1"
 KernelAbstractions = "0.9"
-MuladdMacro = "0.2"
 OrdinaryDiffEq = "6.91"
 OrdinaryDiffEqCore = "2, 3"
 PointNeighbors = "0.6.5"
diff --git a/docs/literate/src/tut_custom_kernel.jl b/docs/literate/src/tut_custom_kernel.jl
index 25a3bed683..053f0c74a0 100644
--- a/docs/literate/src/tut_custom_kernel.jl
+++ b/docs/literate/src/tut_custom_kernel.jl
@@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end
 
 # By looking at the implementation of existing kernels in TrixiParticles.jl,
 # we can see that a kernel implementation requires three functions.
-# `TrixiParticles.kernel`, which is the kernel function itself,
-# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function,
-# and `TrixiParticles.compact_support`, which defines the compact support of the
-# kernel in relation to the smoothing length.
+# `TrixiParticles.kernel_unsafe`, which is the kernel function itself,
+# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the
+# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines
+# the compact support of the kernel in relation to the smoothing length.
 # The latter is relevant for determining the search radius of the neighborhood search.
-function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h)
+#
+# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since
+# this avoids an extra division in the hot loop and is robust near ``r=0``.
+# The public function `TrixiParticles.kernel_deriv` is defined automatically by
+# TrixiParticles from this method (and multiplies by ``r`` again when needed).
+# In the unsafe functions, we do not check the compact support; this is handled in the
+# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the
+# compact support defined in `TrixiParticles.compact_support`.
+function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * exp(-q^2)
-    end
-
-    return 0.0
+    return 1 / (pi * h^2) * exp(-q^2)
 end
 
-function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h)
+function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
-    end
-
-    return 0.0
+    kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
+    return kernel_deriv / r
 end
 
 TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h
diff --git a/docs/make.jl b/docs/make.jl
index 6863c72fb1..3c04970b77 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -72,7 +72,8 @@ makedocs(sitename="TrixiParticles.jl",
          plugins=[bib],
          # Run doctests and check docs for the following modules
          modules=[TrixiParticles, TrixiBase],
-         format=Documenter.HTML(; assets=Asciicast.assets()),
+         # Set edit_link explicitly to avoid `git remote show origin` lookups.
+         format=Documenter.HTML(; assets=Asciicast.assets(), edit_link="main"),
          # Explicitly specify documentation structure
          pages=[
              "Home" => "index.md",
diff --git a/docs/src/general/smoothing_kernels.md b/docs/src/general/smoothing_kernels.md
index 6e68b1152d..ea36cbf8aa 100644
--- a/docs/src/general/smoothing_kernels.md
+++ b/docs/src/general/smoothing_kernels.md
@@ -16,108 +16,107 @@ The following smoothing kernels are currently available:
 
 Any Kernel with a stability rating of more than '+++' doesn't suffer from pairing-instability.
 
-We recommend to use the [`WendlandC2Kernel`](@ref) for most applications.
+We recommend using the [`WendlandC2Kernel`](@ref) for most applications.
 If less smoothing is needed, try [`SchoenbergCubicSplineKernel`](@ref), for more smoothing try [`WendlandC6Kernel`](@ref).
 
 ```@eval
-
-using TrixiParticles                     
-using CairoMakie                         
-
-# --- Group the kernels for combined plotting ---  
-wendland_kernels = [                     
-    ("Wendland C2", WendlandC2Kernel{2}()),  
-    ("Wendland C4", WendlandC4Kernel{2}()),  
-    ("Wendland C6", WendlandC6Kernel{2}()),  
-]                                        
-
-schoenberg_kernels = [                   
-    ("Cubic Spline",   SchoenbergCubicSplineKernel{2}()),   
-    ("Quartic Spline", SchoenbergQuarticSplineKernel{2}()), 
-    ("Quintic Spline", SchoenbergQuinticSplineKernel{2}()), 
-]                                        
-
-other_kernels = [                        
-    ("Gaussian",       GaussianKernel{2}()),       
-    ("Poly6",          Poly6Kernel{2}()),          
-    ("Laguerre-Gauss", LaguerreGaussKernel{2}()), 
-]                                        
-
-spiky_kernel_group = [                   
-    ("Spiky Kernel", SpikyKernel{2}()),  
-]                                        
-
-# A list of all kernel groups to be plotted                  
-# A boolean flag controls whether to apply the consistent y-range  
-kernel_groups = [                                            
-    (title="Wendland Kernels",          kernels=wendland_kernels,   use_consistent_range=true),  
-    (title="Schoenberg Spline Kernels", kernels=schoenberg_kernels, use_consistent_range=true),  
-    (title="Other Kernels",             kernels=other_kernels,      use_consistent_range=true),  
-    (title="Spiky Kernel",              kernels=spiky_kernel_group, use_consistent_range=false), 
-]                                                            
-
-# --- Pre-calculate global y-ranges for consistency ---      
-kernels_for_range_calc = vcat(wendland_kernels, schoenberg_kernels, other_kernels)  
-
-q_range = range(0, 3, length=300)     
-h = 1.0                               
-min_val, max_val = Inf, -Inf          
-min_deriv, max_deriv = Inf, -Inf      
-
-for (_, kernel_obj) in kernels_for_range_calc                             
-    kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range]      
+using TrixiParticles
+using CairoMakie
+
+# --- Group the kernels for combined plotting ---
+wendland_kernels = [
+    ("Wendland C2", WendlandC2Kernel{2}()),
+    ("Wendland C4", WendlandC4Kernel{2}()),
+    ("Wendland C6", WendlandC6Kernel{2}()),
+]
+
+schoenberg_kernels = [
+    ("Cubic Spline",   SchoenbergCubicSplineKernel{2}()),
+    ("Quartic Spline", SchoenbergQuarticSplineKernel{2}()),
+    ("Quintic Spline", SchoenbergQuinticSplineKernel{2}()),
+]
+
+other_kernels = [
+    ("Gaussian",       GaussianKernel{2}()),
+    ("Poly6",          Poly6Kernel{2}()),
+    ("Laguerre-Gauss", LaguerreGaussKernel{2}()),
+]
+
+spiky_kernel_group = [
+    ("Spiky Kernel", SpikyKernel{2}()),
+]
+
+# A list of all kernel groups to be plotted
+# A boolean flag controls whether to apply the consistent y-range
+kernel_groups = [
+    (title="Wendland Kernels",          kernels=wendland_kernels,   use_consistent_range=true),
+    (title="Schoenberg Spline Kernels", kernels=schoenberg_kernels, use_consistent_range=true),
+    (title="Other Kernels",             kernels=other_kernels,      use_consistent_range=true),
+    (title="Spiky Kernel",              kernels=spiky_kernel_group, use_consistent_range=false),
+]
+
+# --- Pre-calculate global y-ranges for consistency ---
+kernels_for_range_calc = vcat(wendland_kernels, schoenberg_kernels, other_kernels)
+
+q_range = range(0, 3, length=300)
+h = 1.0
+min_val, max_val = Inf, -Inf
+min_deriv, max_deriv = Inf, -Inf
+
+for (_, kernel_obj) in kernels_for_range_calc
+    kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range]
     kernel_derivs = [TrixiParticles.kernel_deriv(kernel_obj, q, h) for q in q_range]
 
-    global min_val = min(min_val, minimum(kernel_values))     
-    global max_val = max(max_val, maximum(kernel_values))     
-    global min_deriv = min(min_deriv, minimum(kernel_derivs)) 
-    global max_deriv = max(max_deriv, maximum(kernel_derivs)) 
-end                                                                      
+    global min_val = min(min_val, minimum(kernel_values))
+    global max_val = max(max_val, maximum(kernel_values))
+    global min_deriv = min(min_deriv, minimum(kernel_derivs))
+    global max_deriv = max(max_deriv, maximum(kernel_derivs))
+end
 
-# Add 10% padding to the y-limits for better visuals          
-y_range_val = (min_val - 0.1 * (max_val - min_val),           
-               max_val + 0.1 * (max_val - min_val))           
-y_range_deriv = (min_deriv - 0.1 * (max_deriv - min_deriv),   
-                 max_deriv + 0.1 * (max_deriv - min_deriv))   
+# Add 10% padding to the y-limits for better visuals
+y_range_val = (min_val - 0.1 * (max_val - min_val),
+               max_val + 0.1 * (max_val - min_val))
+y_range_deriv = (min_deriv - 0.1 * (max_deriv - min_deriv),
+                 max_deriv + 0.1 * (max_deriv - min_deriv))
 
-fig = Figure(size = (1000, 1200), fontsize=16)                
+fig = Figure(size = (1000, 1200), fontsize=16)
 
-for (i, group) in enumerate(kernel_groups)                    
-    ax_val = Axis(fig[i, 1],                                  
-                  xlabel = "q = r/h", ylabel = "w(q)",        
-                  title = group.title)                        
+for (i, group) in enumerate(kernel_groups)
+    ax_val = Axis(fig[i, 1],
+                  xlabel = "q = r/h", ylabel = "w(q)",
+                  title = group.title)
 
-    ax_deriv = Axis(fig[i, 2],                                
-                    xlabel = "q = r/h", ylabel = "w'(q)",     
-                    title = group.title)                      
+    ax_deriv = Axis(fig[i, 2],
+                    xlabel = "q = r/h", ylabel = "w'(q)",
+                    title = group.title)
 
-    if group.use_consistent_range                             
-        ylims!(ax_val, y_range_val)                           
-        ylims!(ax_deriv, y_range_deriv)                       
-    end                                                       
+    if group.use_consistent_range
+        ylims!(ax_val, y_range_val)
+        ylims!(ax_deriv, y_range_deriv)
+    end
 
-    hlines!(ax_val, [0.0], linestyle = :dash)                 
-    hlines!(ax_deriv, [0.0], linestyle = :dash)               
+    hlines!(ax_val, [0.0], linestyle = :dash)
+    hlines!(ax_deriv, [0.0], linestyle = :dash)
 
-    for (name, kernel_obj) in group.kernels                   
-        kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range]    
-        kernel_derivs = [TrixiParticles.kernel_deriv(kernel_obj, q, h) for q in q_range]  
+    for (name, kernel_obj) in group.kernels
+        kernel_values = [TrixiParticles.kernel(kernel_obj, q, h) for q in q_range]
+        kernel_derivs = [TrixiParticles.kernel_deriv(kernel_obj, q, h) for q in q_range]
 
-        lines!(ax_val, q_range, kernel_values, label=name, linewidth=2.5)   
-        lines!(ax_deriv, q_range, kernel_derivs, label=name, linewidth=2.5) 
-    end                                                       
+        lines!(ax_val, q_range, kernel_values, label=name, linewidth=2.5)
+        lines!(ax_deriv, q_range, kernel_derivs, label=name, linewidth=2.5)
+    end
 
-    axislegend(ax_val, position = :rt)                        
-    axislegend(ax_deriv, position = :rt)                      
-end                                                           
+    axislegend(ax_val, position = :rt)
+    axislegend(ax_deriv, position = :rt)
+end
 
-# Add row gaps between the 4 rows (3 gaps total)              
-for i in 1:(length(kernel_groups) - 1)                        
-    rowgap!(fig.layout, i, 25)                                
-end                                                           
+# Add row gaps between the 4 rows (3 gaps total)
+for i in 1:(length(kernel_groups) - 1)
+    rowgap!(fig.layout, i, 25)
+end
 
 CairoMakie.save("smoothing_kernels.png", fig)
-                                                           
+
 ```
 
 ![Radial profiles and derivatives of the available smoothing kernels](smoothing_kernels.png)
@@ -125,18 +124,24 @@ CairoMakie.save("smoothing_kernels.png", fig)
 
 !!! note "Usage"
     The kernel can be called as
-    ```
+    ```jldoctest kernels; output = false, setup = :(using TrixiParticles, LinearAlgebra; smoothing_kernel = WendlandC2Kernel{2}(); h = 1.0; r = 0.5)
     TrixiParticles.kernel(smoothing_kernel, r, h)
+
+    # output
+    0.35250333098869013
     ```
     The length of the compact support can be obtained as
-    ```
+    ```jldoctest kernels; output = false
     TrixiParticles.compact_support(smoothing_kernel, h)
+
+    # output
+    2.0
     ```
 
     Note that ``r`` has to be a scalar, so in the context of SPH, the kernel
     should be used as
     ```math
-    W(\Vert r_a - r_b \Vert, h).
+        W(\Vert r_a - r_b \Vert, h).
     ```
 
     The gradient required in SPH,
@@ -144,10 +149,61 @@ CairoMakie.save("smoothing_kernels.png", fig)
         \nabla_{r_a} W(\Vert r_a - r_b \Vert, h)
     ```
     can be called as
-    ```
+    ```jldoctest kernels; output = false, setup = :(pos_diff = SVector(0.5, 0.5); distance = norm(pos_diff))
     TrixiParticles.kernel_grad(smoothing_kernel, pos_diff, distance, h)
+
+    # output
+    2-element SVector{2, Float64} with indices SOneTo(2):
+     -0.3762063922043116
+     -0.3762063922043116
+    ```
+    where `pos_diff` is $r_a - r_b$ and `distance` is $\Vert r_a - r_b \Vert$,
+    although in most cases, it is recommended to use
+    ```jldoctest kernels; output = false, setup=:(trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "hydrostatic_water_column_2d.jl"), sol=nothing); system = fluid_system; particle = 1)
+    TrixiParticles.smoothing_kernel_grad(system, pos_diff, distance, particle)
+
+    # output
+    2-element SVector{2, Float64} with indices SOneTo(2):
+     0.0
+     0.0
+    ```
+    instead, which will also take into account if the system is using a corrected kernel.
+
+    The equivalent function to obtain the kernel value is
+    ```jldoctest kernels; output = false
+    TrixiParticles.smoothing_kernel(system, distance, particle)
+
+    # output
+    0.0
+    ```
+
+    In performance-critical code, it is recommended to use the `_unsafe` versions of these
+    functions, which skip the compact support and zero distance checks:
+    ```jldoctest kernels; output = false
+    TrixiParticles.smoothing_kernel_unsafe(system, distance, particle)
+    TrixiParticles.smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+
+    # output
+    2-element SVector{2, Float64} with indices SOneTo(2):
+     -106899.65266169122
+     -106899.65266169122
+    ```
+    These functions are only safe to use if compact support and zero distance have already
+    been checked.
+    The compact support is automatically checked when inside a `foreach_point_neighbor`
+    or `foreach_neighbor` loop.
+    For the zero distance check, use
+    ```jldoctest kernels; output = false, setup = :(almostzero = 1e-12)
+    TrixiParticles.skip_zero_distance(system) && distance < almostzero
+
+    # output
+    false
     ```
-    where `pos_diff` is $r_a - r_b$ and `distance` is $\Vert r_a - r_b \Vert$.
+    where `skip_zero_distance` returns `true` if the kernel gradient is zero at zero
+    distance, which is not the case for some corrected kernels.
+    Here, `almostzero` is a small threshold, which should be chosen as `sqrt(eps(h^2))`,
+    where `h` is the smoothing length, since `distance^2` is in the order of `h^2`.
+    Note that `sqrt(eps(h^2)) != eps(h)`.
 
 ```@autodocs
 Modules = [TrixiParticles]
diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
index faa1fd2956..215a2b3bf1 100644
--- a/src/TrixiParticles.jl
+++ b/src/TrixiParticles.jl
@@ -17,7 +17,6 @@ using GPUArraysCore: AbstractGPUArray
 using JSON: JSON
 using KernelAbstractions: KernelAbstractions, @kernel, @index
 using LinearAlgebra: norm, normalize, cross, dot, I, tr, inv, pinv, det
-using MuladdMacro: @muladd
 using Polyester: Polyester, @batch
 using Printf: @printf, @sprintf
 using ReadVTK: ReadVTK
diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
index 10ccb6261f..fde3b92a7b 100644
--- a/src/general/abstract_system.jl
+++ b/src/general/abstract_system.jl
@@ -130,10 +130,36 @@ end
     return kernel(smoothing_kernel, distance, smoothing_length(system, particle))
 end
 
+@inline function smoothing_kernel_unsafe(system, distance, particle)
+    (; smoothing_kernel) = system
+    return kernel_unsafe(smoothing_kernel, distance, smoothing_length(system, particle))
+end
+
+@inline function skip_zero_distance(system::AbstractSystem)
+    return skip_zero_distance(system_correction(system))
+end
+
+# Robust/safe version of the function below. In performance-critical code, manually check
+# the kernel support, call `skip_zero_distance` and then `smoothing_kernel_grad_unsafe`.
 @inline function smoothing_kernel_grad(system, pos_diff, distance, particle)
-    return corrected_kernel_grad(system_smoothing_kernel(system), pos_diff,
-                                 distance, smoothing_length(system, particle),
-                                 system_correction(system), system, particle)
+    h = smoothing_length(system, particle)
+    compact_support_ = compact_support(system_smoothing_kernel(system), h)
+
+    # Note that `sqrt(eps(h^2)) != eps(h)`
+    if distance >= compact_support_ ||
+       (skip_zero_distance(system) && distance^2 < eps(h^2))
+        return zero(pos_diff)
+    end
+
+    return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+end
+
+# Skip the zero distance and compact support checks for maximum performance.
+# Only call this when you are sure that `0 < distance < compact_support`.
+@inline function smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+    return corrected_kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff,
+                                        distance, smoothing_length(system, particle),
+                                        system_correction(system), system, particle)
 end
 
 # System updates do nothing by default, but can be dispatched if needed
diff --git a/src/general/corrections.jl b/src/general/corrections.jl
index 292523feb8..77f4fae0dd 100644
--- a/src/general/corrections.jl
+++ b/src/general/corrections.jl
@@ -204,6 +204,15 @@ function compute_correction_values!(system,
 
         neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+        # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+        # and the density diffusion divide by zero.
+        # To account for rounding errors, we check if `distance` is almost zero.
+        # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+        # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+        # Note that `sqrt(eps(h^2)) != eps(h)`.
+        h = initial_smoothing_length(system)
+        almostzero = sqrt(eps(h^2))
+
         # Loop over all pairs of particles and neighbors within the kernel cutoff
         foreach_point_neighbor(system, neighbor_system, system_coords, neighbor_coords,
                                semi) do particle, neighbor, pos_diff, distance
@@ -217,10 +226,12 @@ function compute_correction_values!(system,
 
             kernel_correction_coefficient[particle] += volume * W
 
-            # Only consider particles with a distance > 0. See `src/general/smoothing_kernels.jl` for more details.
-            if distance^2 > eps(initial_smoothing_length(system)^2)
-                grad_W = kernel_grad(system_smoothing_kernel(system), pos_diff, distance,
-                                     smoothing_length(system, particle))
+            # Only consider particles with a distance > 0.
+            if distance > almostzero
+                # Now that we know that `distance` is not zero, we can safely call the
+                # unsafe version of the kernel gradient to avoid redundant zero checks.
+                grad_W = kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff,
+                                            distance, smoothing_length(system, particle))
                 tmp = volume * grad_W
                 for i in axes(dw_gamma, 1)
                     dw_gamma[i, particle] += tmp[i]
@@ -337,31 +348,36 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system,
         v_neighbor_system = wrap_v(v_ode, neighbor_system, semi)
 
         neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+        almostzero = sqrt(eps(compact_support(system, neighbor_system)^2))
 
         foreach_point_neighbor(system, neighbor_system, coordinates, neighbor_coords,
                                semi) do particle, neighbor, pos_diff, distance
-            volume = hydrodynamic_mass(neighbor_system, neighbor) /
-                     current_density(v_neighbor_system, neighbor_system, neighbor)
-            smoothing_length_ = smoothing_length(system, particle)
-
-            function compute_grad_kernel(correction, smoothing_kernel, pos_diff, distance,
-                                         smoothing_length_, system, particle)
-                return smoothing_kernel_grad(system, pos_diff, distance, particle)
+            function kernel_grad_local(correction, smoothing_kernel, pos_diff, distance,
+                                       smoothing_length_, system, particle)
+                return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
             end
 
             # Compute gradient of corrected kernel
-            function compute_grad_kernel(correction::MixedKernelGradientCorrection,
-                                         smoothing_kernel, pos_diff, distance,
-                                         smoothing_length_, system, particle)
-                return corrected_kernel_grad(smoothing_kernel, pos_diff, distance,
-                                             smoothing_length_, KernelCorrection(), system,
-                                             particle)
+            function kernel_grad_local(correction::MixedKernelGradientCorrection,
+                                       smoothing_kernel, pos_diff, distance,
+                                       smoothing_length_, system, particle)
+                return corrected_kernel_grad_unsafe(smoothing_kernel, pos_diff, distance,
+                                                    smoothing_length_, KernelCorrection(),
+                                                    system, particle)
             end
 
-            grad_kernel = compute_grad_kernel(correction, smoothing_kernel, pos_diff,
-                                              distance, smoothing_length_, system, particle)
+            # Skip neighbors with the same position if the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(correction) && distance < almostzero && return
 
-            iszero(grad_kernel) && return
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            smoothing_length_ = smoothing_length(system, particle)
+            grad_kernel = kernel_grad_local(correction, smoothing_kernel, pos_diff,
+                                            distance, smoothing_length_, system, particle)
+
+            volume = hydrodynamic_mass(neighbor_system, neighbor) /
+                     current_density(v_neighbor_system, neighbor_system, neighbor)
 
             L = volume * grad_kernel * pos_diff'
 
diff --git a/src/general/smoothing_kernels.jl b/src/general/smoothing_kernels.jl
index 4a97564648..456d55a13f 100644
--- a/src/general/smoothing_kernels.jl
+++ b/src/general/smoothing_kernels.jl
@@ -9,44 +9,86 @@ abstract type AbstractSmoothingKernel{NDIMS} end
     # `distance^2` is in the order of `h^2`, hence the comparison `distance^2 < eps(h^2)`.
     # Note that this is faster than `distance < sqrt(eps(h^2))`.
     # Also note that `sqrt(eps(h^2)) != eps(h)`.
-    distance^2 < eps(h^2) && return zero(pos_diff)
+    compact_support_ = compact_support(kernel, h)
+    nonzero = distance < compact_support_ && distance^2 > eps(h^2)
+    nonzero || return zero(pos_diff)
 
-    return kernel_deriv(kernel, distance, h) / distance * pos_diff
+    # Now we can use `kernel_grad_unsafe` without worrying about division by zero
+    return kernel_grad_unsafe(kernel, pos_diff, distance, h)
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, correction, system,
-                                       particle)
-    return kernel_grad(kernel, pos_diff, distance, h)
+# Skip the zero distance and compact support checks for maximum performance.
+# Only call this when you are sure that `distance` is not zero and within the compact support.
+@inline function kernel_grad_unsafe(kernel, pos_diff, distance, h)
+    # Call an optimized version of the kernel derivative, skipping redundant checks.
+    # Since this is used as `kernel_deriv / distance * pos_diff`, we already divide by `r`
+    # (or rather skip a multiplication by `r`) inside `kernel_deriv_div_r_unsafe`
+    # to save expensive divisions.
+    return kernel_deriv_div_r_unsafe(kernel, distance, h) * pos_diff
 end
 
-@inline function corrected_kernel_grad(kernel_, pos_diff, distance, h, ::KernelCorrection,
-                                       system, particle)
+@inline function kernel(kernel, r::Real, h)
+    # Zero out result if outside of compact support
+    compact_support_ = compact_support(kernel, h)
+    return ifelse(r < compact_support_, kernel_unsafe(kernel, r, h), zero(r))
+end
+
+@inline function kernel_deriv(kernel, r::Real, h)
+    # Zero out result if outside of compact support or if `r` is almost zero
+    # (to avoid division by zero in the unsafe version).
+    compact_support_ = compact_support(kernel, h)
+    if r < compact_support_ && r^2 > eps(h^2)
+        # The unsafe version returns the kernel derivative divided by `r`,
+        # so we multiply it by `r` to get the actual derivative.
+        return kernel_deriv_div_r_unsafe(kernel, r, h) * r
+    end
+
+    return zero(r)
+end
+
+@inline function skip_zero_distance(correction)
+    return true
+end
+
+@inline function skip_zero_distance(::Union{KernelCorrection,
+                                            MixedKernelGradientCorrection})
+    # For these corrections, the kernel gradient between a particle and itself is not zero
+    return false
+end
+
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, correction,
+                                              system, particle)
+    return kernel_grad_unsafe(kernel, pos_diff, distance, h)
+end
+
+@inline function corrected_kernel_grad_unsafe(kernel_, pos_diff, distance, h,
+                                              ::KernelCorrection, system, particle)
     return (kernel_grad(kernel_, pos_diff, distance, h) .-
             kernel(kernel_, distance, h) * dw_gamma(system, particle)) /
            kernel_correction_coefficient(system, particle)
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h,
-                                       corr::BlendedGradientCorrection, system,
-                                       particle)
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h,
+                                              corr::BlendedGradientCorrection, system,
+                                              particle)
     (; blending_factor) = corr
 
-    grad = kernel_grad(kernel, pos_diff, distance, h)
+    grad = kernel_grad_unsafe(kernel, pos_diff, distance, h)
     return (1 - blending_factor) * grad +
            blending_factor * correction_matrix(system, particle) * grad
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h,
-                                       ::GradientCorrection, system, particle)
-    grad = kernel_grad(kernel, pos_diff, distance, h)
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h,
+                                              ::GradientCorrection, system, particle)
+    grad = kernel_grad_unsafe(kernel, pos_diff, distance, h)
     return correction_matrix(system, particle) * grad
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h,
-                                       ::MixedKernelGradientCorrection, system,
-                                       particle)
-    grad = corrected_kernel_grad(kernel, pos_diff, distance, h, KernelCorrection(),
-                                 system, particle)
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h,
+                                              ::MixedKernelGradientCorrection, system,
+                                              particle)
+    grad = corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, KernelCorrection(),
+                                        system, particle)
     return correction_matrix(system, particle) * grad
 end
 
@@ -80,32 +122,40 @@ which is beneficial in regards to stability but makes it less accurate.
 """
 struct GaussianKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@inline @fastmath function kernel(kernel::GaussianKernel, r::Real, h)
-    q = r / h
+@inline function kernel_unsafe(kernel::GaussianKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
 
-    # Zero out result if q >= 3
-    result = ifelse(q < 3, normalization_factor(kernel, h) * exp(-q^2), zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * exp_q2
 end
 
-@inline @fastmath function kernel_deriv(kernel::GaussianKernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
-
-    # Zero out result if q >= 3
-    result = ifelse(q < 3,
-                    -2 * q * normalization_factor(kernel, h) * exp(-q^2) * inner_deriv,
-                    zero(q))
+@inline function kernel_deriv_div_r_unsafe(kernel::GaussianKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
 
-    return result
+    return -2 * normalization_factor(kernel, h_inv) * exp_q2 * h_inv^2
 end
 
 @inline compact_support(::GaussianKernel, h) = 3 * h
 
-@inline normalization_factor(::GaussianKernel{2}, h) = 1 / (pi * h^2)
-# First convert `pi` to the type of `h` to preserve the type of `h`
-@inline normalization_factor(::GaussianKernel{3}, h) = 1 / (oftype(h, pi)^(3 // 2) * h^3)
+@inline function normalization_factor(::GaussianKernel{2}, h_inv)
+    return oftype(h_inv, 1 / pi) * h_inv^2
+end
+
+@inline function normalization_factor(::GaussianKernel{3}, h_inv)
+    # 1 / (pi^1.5 * h^3)
+    return oftype(h_inv, 0.17958712212516656) * h_inv^3
+end
 
 @doc raw"""
     SchoenbergCubicSplineKernel{NDIMS}()
@@ -140,40 +190,41 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct SchoenbergCubicSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@muladd @inline function kernel(kernel::SchoenbergCubicSplineKernel, r::Real, h)
-    q = r / h
-
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl.
-    result = 1 * (2 - q)^3 / 4
-    result = result - (q < 1) * (1 - q)^3
+@inline function kernel_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(result))
+    result = 1 * (2 - q)^3 / 4 - (q < 1) * (1 - q)^3
 
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@muladd @inline function kernel_deriv(kernel::SchoenbergCubicSplineKernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = -3 * (2 - q)^2 / 4
-    result = result + 3 * (q < 1) * (1 - q)^2
+    result = -3 * (2 - q)^2 / 4 + 3 * (q < 1) * (1 - q)^2
+    kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result * inner_deriv,
-                    zero(result))
-
-    return result
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::SchoenbergCubicSplineKernel, h) = 2 * h
 
-# Note that `2 // 3 / h` saves one instruction but is significantly slower on GPUs (for now)
-@inline normalization_factor(::SchoenbergCubicSplineKernel{1}, h) = 2 / (3 * h)
-@inline normalization_factor(::SchoenbergCubicSplineKernel{2}, h) = 10 / (pi * h^2 * 7)
-@inline normalization_factor(::SchoenbergCubicSplineKernel{3}, h) = 1 / (pi * h^3)
+@inline function normalization_factor(::SchoenbergCubicSplineKernel{1}, h_inv)
+    return oftype(h_inv, 2 / 3) * h_inv
+end
+
+@inline function normalization_factor(::SchoenbergCubicSplineKernel{2}, h_inv)
+    return oftype(h_inv, 10 / (pi * 7)) * h_inv^2
+end
+
+@inline function normalization_factor(::SchoenbergCubicSplineKernel{3}, h_inv)
+    return oftype(h_inv, 1 / pi) * h_inv^3
+end
 
 @doc raw"""
     SchoenbergQuarticSplineKernel{NDIMS}()
@@ -217,54 +268,58 @@ struct SchoenbergQuarticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} en
 # to a power of three, see
 # https://github.com/JuliaLang/julia/blob/34934736fa4dcb30697ac1b23d11d5ad394d6a4d/base/intfuncs.jl#L327-L339
 # By using the `@fastpow` macro, we are consciously trading off some precision in the result
-# for enhanced computational speed. This is especially useful in scenarios where performance
-# is a higher priority than exact precision.
-@fastpow @muladd @inline function kernel(kernel::SchoenbergQuarticSplineKernel, r::Real, h)
-    q = r / h
+# for enhanced computational speed by using simple multiplications for higher powers.
+@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuarticSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Preserve the type of `q`
-    q5_2 = (5 // 2 - q)
-    q3_2 = (3 // 2 - q)
-    q1_2 = (1 // 2 - q)
+    # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason
+    q5_2 = 2.5f0 - q
+    q3_2 = 1.5f0 - q
+    q1_2 = 0.5f0 - q
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = q5_2^4
-    result = result - 5 * (q < 3 // 2) * q3_2^4
-    result = result + 10 * (q < 1 // 2) * q1_2^4
+    # Note: `q < 3 // 2` is not working on GPUs.
+    # See https://github.com/JuliaGPU/CUDA.jl/issues/2681.
+    result = q5_2^4 - 5 * (q < 1.5f0) * q3_2^4 + 10 * (q < 0.5f0) * q1_2^4
 
-    # Zero out result if q >= 5/2
-    result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result, zero(result))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuarticSplineKernel,
-                                               r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuarticSplineKernel,
+                                           r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Preserve the type of `q`
-    q5_2 = 5 // 2 - q
-    q3_2 = 3 // 2 - q
-    q1_2 = 1 // 2 - q
+    # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason
+    q5_2 = 2.5f0 - q
+    q3_2 = 1.5f0 - q
+    q1_2 = 0.5f0 - q
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = -4 * q5_2^3
-    result = result + 20 * (q < 3 // 2) * q3_2^3
-    result = result - 40 * (q < 1 // 2) * q1_2^3
+    # Note: `q < 3 // 2` is not working on GPUs.
+    # See https://github.com/JuliaGPU/CUDA.jl/issues/2681.
+    result = -4 * q5_2^3 + 20 * (q < 1.5f0) * q3_2^3 - 40 * (q < 0.5f0) * q1_2^3
 
-    # Zero out result if q >= 5/2
-    result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result * inner_deriv,
-                    zero(result))
+    kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv
 
-    return result
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    return div_fast(kernel_deriv, r)
 end
 
-@inline compact_support(::SchoenbergQuarticSplineKernel, h) = 5 * h / 2
+@inline compact_support(::SchoenbergQuarticSplineKernel, h) = oftype(h, 5 / 2) * h
 
-@inline normalization_factor(::SchoenbergQuarticSplineKernel{1}, h) = 1 / (24 * h)
-@inline normalization_factor(::SchoenbergQuarticSplineKernel{2}, h) = 96 / (pi * h^2 * 1199)
-@inline normalization_factor(::SchoenbergQuarticSplineKernel{3}, h) = 1 / (pi * h^3 * 20)
+@inline function normalization_factor(::SchoenbergQuarticSplineKernel{1}, h_inv)
+    return oftype(h_inv, 1 / 24) * h_inv
+end
+
+@inline function normalization_factor(::SchoenbergQuarticSplineKernel{2}, h_inv)
+    return oftype(h_inv, 96 / (pi * 1199)) * h_inv^2
+end
+
+@inline function normalization_factor(::SchoenbergQuarticSplineKernel{3}, h_inv)
+    return oftype(h_inv, 1 / (pi * 20)) * h_inv^3
+end
 
 @doc raw"""
     SchoenbergQuinticSplineKernel{NDIMS}()
@@ -301,48 +356,43 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct SchoenbergQuinticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@fastpow @muladd @inline function kernel(kernel::SchoenbergQuinticSplineKernel, r::Real, h)
-    q = r / h
-    q3 = (3 - q)
-    q2 = (2 - q)
-    q1 = (1 - q)
+@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuinticSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = q3^5
-    result = result - 6 * (q < 2) * q2^5
-    result = result + 15 * (q < 1) * q1^5
+    result = (3 - q)^5 - 6 * (q < 2) * (2 - q)^5 + 15 * (q < 1) * (1 - q)^5
 
-    # Zero out result if q >= 3
-    result = ifelse(q < 3, normalization_factor(kernel, h) * result, zero(result))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuinticSplineKernel,
-                                               r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
-    q3 = (3 - q)
-    q2 = (2 - q)
-    q1 = (1 - q)
+@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuinticSplineKernel,
+                                                    r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = -5 * q3^4
-    result = result + 30 * (q < 2) * q2^4
-    result = result - 75 * (q < 1) * q1^4
+    result = -5 * (3 - q)^4 + 30 * (q < 2) * (2 - q)^4 - 75 * (q < 1) * (1 - q)^4
 
-    # Zero out result if q >= 3
-    result = ifelse(q < 3, normalization_factor(kernel, h) * result * inner_deriv,
-                    zero(result))
+    kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv
 
-    return result
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::SchoenbergQuinticSplineKernel, h) = 3 * h
 
-@inline normalization_factor(::SchoenbergQuinticSplineKernel{1}, h) = 1 / (120 * h)
-@inline normalization_factor(::SchoenbergQuinticSplineKernel{2}, h) = 7 / (pi * h^2 * 478)
-@inline normalization_factor(::SchoenbergQuinticSplineKernel{3}, h) = 1 / (pi * h^3 * 120)
+@inline function normalization_factor(::SchoenbergQuinticSplineKernel{1}, h_inv)
+    return oftype(h_inv, 1 / (120)) * h_inv
+end
+
+@inline function normalization_factor(::SchoenbergQuinticSplineKernel{2}, h_inv)
+    return oftype(h_inv, 7 / (pi * 478)) * h_inv^2
+end
+
+@inline function normalization_factor(::SchoenbergQuinticSplineKernel{3}, h_inv)
+    return oftype(h_inv, 1 / (pi * 120)) * h_inv^3
+end
 
 abstract type AbstractWendlandKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
@@ -383,33 +433,26 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct WendlandC2Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::WendlandC2Kernel, r::Real, h)
-    q = r / h
-
-    result = (1 - q / 2)^4 * (2 * q + 1)
+@fastpow @inline function kernel_unsafe(kernel::WendlandC2Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * (1 - q / 2)^4 * (2 * q + 1)
 end
 
-@inline function kernel_deriv(kernel::WendlandC2Kernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
-
-    result = -5 * (1 - q / 2)^3 * q
+@inline function kernel_deriv_div_r_unsafe(kernel::WendlandC2Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2,
-                    normalization_factor(kernel, h) * result * inner_deriv, zero(q))
-
-    return result
+    return -5 * normalization_factor(kernel, h_inv) * (1 - q / 2)^3 * h_inv^2
 end
 
-# Note that `7 // 4` saves one instruction but is significantly slower on GPUs (for now)
-@inline normalization_factor(::WendlandC2Kernel{2}, h) = 7 / (pi * h^2 * 4)
-@inline normalization_factor(::WendlandC2Kernel{3}, h) = 21 / (pi * h^3 * 16)
+@inline function normalization_factor(::WendlandC2Kernel{2}, h_inv)
+    return oftype(h_inv, 7 / (4 * pi)) * h_inv^2
+end
+@inline function normalization_factor(::WendlandC2Kernel{3}, h_inv)
+    return oftype(h_inv, 21 / (16 * pi)) * h_inv^3
+end
 
 @doc raw"""
     WendlandC4Kernel{NDIMS}()
@@ -445,31 +488,30 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct WendlandC4Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::WendlandC4Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_unsafe(kernel::WendlandC4Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
     result = (1 - q / 2)^6 * (35 * q^2 / 12 + 3 * q + 1)
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @inline function kernel_deriv(kernel::WendlandC4Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC4Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    derivative = -7 * q / 3 * (2 + 5 * q) * (1 - q / 2)^5
+    result = oftype(r, -7 / 3) * (2 + 5 * q) * (1 - q / 2)^5 * h_inv^2
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h,
-                    zero(derivative))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@inline normalization_factor(::WendlandC4Kernel{2}, h) = 9 / (pi * h^2 * 4)
-@inline normalization_factor(::WendlandC4Kernel{3}, h) = 495 / (pi * h^3 * 256)
+@inline function normalization_factor(::WendlandC4Kernel{2}, h_inv)
+    return oftype(h_inv, 9 / (pi * 4)) * h_inv^2
+end
+@inline function normalization_factor(::WendlandC4Kernel{3}, h_inv)
+    return oftype(h_inv, 495 / (pi * 256)) * h_inv^3
+end
 
 @doc raw"""
     WendlandC6Kernel{NDIMS}()
@@ -505,31 +547,31 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct WendlandC6Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::WendlandC6Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_unsafe(kernel::WendlandC6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
     result = (1 - q / 2)^8 * (4 * q^3 + 25 * q^2 / 4 + 4 * q + 1)
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @muladd @inline function kernel_deriv(kernel::WendlandC6Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    derivative = -11 * q / 4 * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7
+    result = oftype(r, -11 / 4) * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7 * h_inv^2
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h,
-                    zero(derivative))
+    return normalization_factor(kernel, h_inv) * result
+end
 
-    return result
+@inline function normalization_factor(::WendlandC6Kernel{2}, h_inv)
+    return oftype(h_inv, 39 / (pi * 14)) * h_inv^2
 end
 
-@inline normalization_factor(::WendlandC6Kernel{2}, h) = 39 / (pi * h^2 * 14)
-@inline normalization_factor(::WendlandC6Kernel{3}, h) = 1365 / (pi * h^3 * 512)
+@inline function normalization_factor(::WendlandC6Kernel{3}, h_inv)
+    return oftype(h_inv, 1365 / (pi * 512)) * h_inv^3
+end
 
 @doc raw"""
     Poly6Kernel{NDIMS}()
@@ -541,7 +583,6 @@ especially in computer graphics contexts. It is defined as
 W(r, h) = \frac{1}{h^d} w(r/h)
 ```
 
-
 with
 
 ```math
@@ -569,34 +610,34 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct Poly6Kernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@inline function kernel(kernel::Poly6Kernel, r::Real, h)
-    q = r / h
-
-    result = (1 - q^2)^3
+@inline function kernel_unsafe(kernel::Poly6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 1
-    result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * (1 - q^2)^3
 end
 
-@inline function kernel_deriv(kernel::Poly6Kernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::Poly6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    result = -6 * q * (1 - q^2)^2
+    kernel_deriv = normalization_factor(kernel, h_inv) * -6 * q * (1 - q^2)^2 * h_inv
 
-    # Zero out result if q >= 1
-    result = ifelse(q < 1,
-                    result * normalization_factor(kernel, h) * inner_deriv, zero(q))
-    return result
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::Poly6Kernel, h) = h
 
-# Note that `315 // 64` saves one instruction but is significantly slower on GPUs (for now)
-@inline normalization_factor(::Poly6Kernel{2}, h) = 4 / (pi * h^2)
-@inline normalization_factor(::Poly6Kernel{3}, h) = 315 / (pi * h^3 * 64)
+@inline function normalization_factor(::Poly6Kernel{2}, h_inv)
+    return oftype(h_inv, 4 / (pi)) * h_inv^2
+end
+
+@inline function normalization_factor(::Poly6Kernel{3}, h_inv)
+    return oftype(h_inv, 315 / (64 * pi)) * h_inv^3
+end
 
 @doc raw"""
     SpikyKernel{NDIMS}()
@@ -634,38 +675,39 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct SpikyKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@inline function kernel(kernel::SpikyKernel, r::Real, h)
-    q = r / h
-
-    result = (1 - q)^3
-
-    # Zero out result if q >= 1
-    result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q))
+@inline function kernel_unsafe(kernel::SpikyKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    return result
+    return normalization_factor(kernel, h_inv) * (1 - q)^3
 end
 
-@inline function kernel_deriv(kernel::SpikyKernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::SpikyKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    result = -3 * (1 - q)^2
+    kernel_deriv = -3 * normalization_factor(kernel, h_inv) * (1 - q)^2 * h_inv
 
-    # Zero out result if q >= 1
-    result = ifelse(q < 1, result * normalization_factor(kernel, h) * inner_deriv, zero(q))
-
-    return result
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::SpikyKernel, h) = h
 
-@inline normalization_factor(::SpikyKernel{2}, h) = 10 / (pi * h^2)
-@inline normalization_factor(::SpikyKernel{3}, h) = 15 / (pi * h^3)
+@inline function normalization_factor(::SpikyKernel{2}, h_inv)
+    return oftype(h_inv, 10 / pi) * h_inv^2
+end
+
+@inline function normalization_factor(::SpikyKernel{3}, h_inv)
+    return oftype(h_inv, 15 / pi) * h_inv^3
+end
 
 @doc raw"""
     LaguerreGaussKernel{NDIMS}()
 
-Truncated Laguerre–Gauss kernel (fourth-order smoothing) as defined in
+Truncated Laguerre-Gauss kernel (fourth-order smoothing) as defined in
 [Wang2024](@cite). Its radial form uses ``q = r/h`` and is truncated at
 ``q = 2`` (compact support ``2h``):
 
@@ -716,51 +758,65 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct LaguerreGaussKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::LaguerreGaussKernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_unsafe(kernel::LaguerreGaussKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # polynomial part: 1 - s^2/2 + s^4/6
+    # Polynomial part
     poly = 1 - q^2 / 2 + q^4 / 6
 
-    # zero out for s ≥ 2
-    return ifelse(q < 2, normalization_factor(kernel, h) * poly * exp(-q^2), zero(q))
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
+
+    return normalization_factor(kernel, h_inv) * poly * exp_q2
 end
 
-@inline function kernel_deriv(kernel::LaguerreGaussKernel, r::Real, h)
-    invh = 1 / h
-    q = r * invh
+@inline function kernel_deriv_div_r_unsafe(kernel::LaguerreGaussKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
+
+    # dg/dq = (q / 3) * (-q^4 + 5q^2 - 9) * exp(-q^2), but we already divide by `r``
+    poly = (h_inv / 3) * ((-q^2 + 5) * q^2 - 9)
 
-    # dg/dq = (q/3)*(-q^4 + 5q^2 - 9) * exp(-q^2)
-    poly = ((-q^2 + 5) * q^2 - 9) * (q / 3)
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
 
-    return ifelse(q < 2, normalization_factor(kernel, h) * exp(-q^2) * poly * invh, zero(q))
+    return normalization_factor(kernel, h_inv) * poly * exp_q2 * h_inv
 end
 
 @inline compact_support(::LaguerreGaussKernel, h) = 2 * h
-# Original normalization factors as in Wang2024
-# @inline normalization_factor(::LaguerreGaussKernel{1}, h) = (8 / (5 * sqrt(pi))) / h
-# @inline normalization_factor(::LaguerreGaussKernel{2}, h) = (3 / (pi)) / (h^2)
-# @inline normalization_factor(::LaguerreGaussKernel{3}, h) = (8 / (pi^(3 // 2))) / (h^3)
+
+# Original normalization factors as in Wang2024:
+# 1D: (8 / (5 * sqrt(pi))) / h
+# 2D: (3 / (pi)) / (h^2)
+# 3D: (8 / (pi^(3 / 2))) / (h^3)
 
 # Renormalized to the truncated integral over [0,2h]
-@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h)
-    # C = 1/(2*(7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4))
-    C = oftype(h, 0.65428780253539)
-    # C' = C/h
-    return C / h
+@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h_inv)
+    # C = 1 / (2 * ((7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4)))
+    C = oftype(h_inv, 0.65428780253539)
+    # C' = C / h
+    return C * h_inv
 end
 
-@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h)
+@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h_inv)
     # C = 2 * pi * (5 - 17 * exp(-4)) / 12
-    C = oftype(h, 2.454963094351984)
+    C_inv = oftype(h_inv, 0.4073380990128333)
     # C' = 1 / (h^2 * C)
-    return 1 / (h^2 * C)
+    return C_inv * h_inv^2
 end
 
-@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h)
+@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h_inv)
     # C = (7 * sqrt(pi) / 32) * erf(2) - (77 / 24) * exp(-4)
-    C = oftype(h, 0.3271479336905373)
+    # C_ = 1 / (4 * pi * C)
+    C_ = oftype(h_inv, 0.24324613836999895)
 
-    # 4*pi cannot be pulled into C otherwise the test fails because of differences in rounding
-    return 1 / (C * 4 * pi * h^3)
+    # C' = 1 / (4 * pi * h^3 * C) = C_ / (h^3)
+    return C_ * h_inv^3
 end
diff --git a/src/preprocessing/particle_packing/rhs.jl b/src/preprocessing/particle_packing/rhs.jl
index 77cf460f5b..807fba072f 100644
--- a/src/preprocessing/particle_packing/rhs.jl
+++ b/src/preprocessing/particle_packing/rhs.jl
@@ -5,11 +5,26 @@ function interact!(dv, v_particle_system, u_particle_system,
     system_coords = current_coordinates(u_particle_system, system)
     neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff
     foreach_point_neighbor(system, neighbor_system, system_coords, neighbor_coords,
                            semi) do particle, neighbor, pos_diff, distance
-        # Only consider particles with a distance > 0. See `src/general/smoothing_kernels.jl` for more details.
-        distance^2 < eps(initial_smoothing_length(system)^2) && return
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(system, pos_diff,
+                                                   distance, particle)
 
         rho_a = system.initial_condition.density[particle]
         rho_b = neighbor_system.initial_condition.density[neighbor]
@@ -22,8 +37,6 @@ function interact!(dv, v_particle_system, u_particle_system,
 
         p_b = system.background_pressure
 
-        grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle)
-
         # This vanishes for uniform particle distributions
         dv_repulsive_pressure = -(2 / m_a) * V_a * V_b * p_b * grad_kernel
 
diff --git a/src/schemes/boundary/open_boundary/rhs.jl b/src/schemes/boundary/open_boundary/rhs.jl
index 85fcc7dfb8..971f3c8a83 100644
--- a/src/schemes/boundary/open_boundary/rhs.jl
+++ b/src/schemes/boundary/open_boundary/rhs.jl
@@ -10,6 +10,15 @@ function interact!(dv, v_particle_system, u_particle_system,
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(particle_system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff
     foreach_point_neighbor(particle_system, neighbor_system,
                            system_coords, neighbor_system_coords, semi;
@@ -17,14 +26,21 @@ function interact!(dv, v_particle_system, u_particle_system,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(particle_system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                   distance, particle)
+
         # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
         # in bounds of the respective system. For performance reasons, we use `@inbounds`
         # in this hot loop to avoid bounds checking when extracting particle quantities.
         rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
         rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
 
-        grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
-
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
 
diff --git a/src/schemes/boundary/wall_boundary/rhs.jl b/src/schemes/boundary/wall_boundary/rhs.jl
index e0553764ef..5d9aef7235 100644
--- a/src/schemes/boundary/wall_boundary/rhs.jl
+++ b/src/schemes/boundary/wall_boundary/rhs.jl
@@ -19,9 +19,27 @@ function interact!(dv, v_particle_system, u_particle_system,
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(particle_system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff.
     foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords,
                            semi) do particle, neighbor, pos_diff, distance
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(particle_system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                   distance, particle)
+
         m_b = hydrodynamic_mass(neighbor_system, neighbor)
 
         rho_a = current_density(v_particle_system, particle_system, particle)
@@ -30,8 +48,6 @@ function interact!(dv, v_particle_system, u_particle_system,
         v_diff = current_velocity(v_particle_system, particle_system, particle) -
                  current_velocity(v_neighbor_system, neighbor_system, neighbor)
 
-        grad_kernel = smoothing_kernel_grad(boundary_model, pos_diff, distance, particle)
-
         continuity_equation!(dv, density_calculator(neighbor_system),
                              m_b, rho_a, rho_b, v_diff, grad_kernel, particle)
     end
diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl
index b91279702b..a6c98a5d64 100644
--- a/src/schemes/fluid/entropically_damped_sph/rhs.jl
+++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl
@@ -11,6 +11,15 @@ function interact!(dv, v_particle_system, u_particle_system,
     surface_tension_a = surface_tension_model(particle_system)
     surface_tension_b = surface_tension_model(neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(particle_system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff
     foreach_point_neighbor(particle_system, neighbor_system,
                            system_coords, neighbor_coords, semi;
@@ -18,6 +27,15 @@ function interact!(dv, v_particle_system, u_particle_system,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(particle_system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                   distance, particle)
+
         # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
         # in bounds of the respective system. For performance reasons, we use `@inbounds`
         # in this hot loop to avoid bounds checking when extracting particle quantities.
@@ -38,8 +56,6 @@ function interact!(dv, v_particle_system, u_particle_system,
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
 
-        grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
-
         dv_pressure = pressure_acceleration(particle_system, neighbor_system,
                                             particle, neighbor,
                                             m_a, m_b, p_a - p_avg, p_b - p_avg, rho_a,
diff --git a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
index ca7c4b2c35..8cebf8d20c 100644
--- a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
+++ b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
@@ -10,6 +10,15 @@ function interact!(dv, v_particle_system, u_particle_system,
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(particle_system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff.
     foreach_point_neighbor(particle_system, neighbor_system,
                            system_coords, neighbor_system_coords, semi;
@@ -17,14 +26,21 @@ function interact!(dv, v_particle_system, u_particle_system,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(particle_system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                   distance, particle)
+
         # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
         # in bounds of the respective system. For performance reasons, we use `@inbounds`
         # in this hot loop to avoid bounds checking when extracting particle quantities.
         rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
         rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
 
-        grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
-
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
 
diff --git a/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl b/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl
index 9837eb1c69..be609e5576 100644
--- a/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/density_diffusion.jl
@@ -221,8 +221,11 @@ end
                                                                        OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang}},
                                                 grad_kernel)
     # Density diffusion terms are all zero for distance zero.
-    # See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return
+    # If `skip_zero_distance` is `true`, we can assume that this function isn't called
+    # for distance zero because these neighbors have already been skipped.
+    if !skip_zero_distance(particle_system)
+        distance^2 < eps(initial_smoothing_length(particle_system)^2) && return
+    end
 
     # Since this is one of the most performance critical functions, using fast divisions
     # here gives a significant speedup on GPUs.
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
index f70891eae3..96915d8b14 100644
--- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -19,6 +19,15 @@ function interact!(dv, v_particle_system, u_particle_system,
     # the following code and the two other lines below that are marked as "debug example".
     # debug_array = zeros(ndims(particle_system), nparticles(particle_system))
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(particle_system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff
     foreach_point_neighbor(particle_system, neighbor_system,
                            system_coords, neighbor_system_coords, semi;
@@ -26,6 +35,15 @@ function interact!(dv, v_particle_system, u_particle_system,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(particle_system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                   distance, particle)
+
         # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
         # in bounds of the respective system. For performance reasons, we use `@inbounds`
         # in this hot loop to avoid bounds checking when extracting particle quantities.
@@ -39,8 +57,6 @@ function interact!(dv, v_particle_system, u_particle_system,
          surface_tension_correction) = free_surface_correction(correction, particle_system,
                                                                rho_mean)
 
-        grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
-
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
 
diff --git a/src/schemes/structure/discrete_element_method/rhs.jl b/src/schemes/structure/discrete_element_method/rhs.jl
index 0fd327b10c..ee16f2a63d 100644
--- a/src/schemes/structure/discrete_element_method/rhs.jl
+++ b/src/schemes/structure/discrete_element_method/rhs.jl
@@ -6,14 +6,22 @@ function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system,
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the radius `r`, `distance^2` is in
+    # the order of `r^2`, so we need to check `distance < sqrt(eps(r^2))`.
+    # Note that `sqrt(eps(r^2)) != eps(r)`.
+    r = maximum(particle_system.radius)
+    almostzero = sqrt(eps(r^2))
+
     foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords,
                            semi;
                            points=each_integrated_particle(particle_system)) do particle,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
-        # See `src/general/smoothing_kernels.jl` for more details
-        distance^2 < eps(first(particle_system.radius)^2) && return
+        distance < almostzero && return
 
         # Retrieve particle properties
         m_a = particle_system.mass[particle]
diff --git a/src/schemes/structure/structure.jl b/src/schemes/structure/structure.jl
index 5fe6d08f1d..ff872c04fc 100644
--- a/src/schemes/structure/structure.jl
+++ b/src/schemes/structure/structure.jl
@@ -17,10 +17,8 @@ end
     end
 end
 
-function interact_structure_fluid!(dv, v_particle_system,
-                                   u_particle_system,
-                                   v_neighbor_system,
-                                   u_neighbor_system,
+function interact_structure_fluid!(dv, v_particle_system, u_particle_system,
+                                   v_neighbor_system, u_neighbor_system,
                                    particle_system,
                                    neighbor_system::AbstractFluidSystem,
                                    semi)
@@ -28,18 +26,32 @@ function interact_structure_fluid!(dv, v_particle_system,
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(neighbor_system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff.
-    foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords,
-                           semi;
+    foreach_point_neighbor(particle_system, neighbor_system,
+                           system_coords, neighbor_coords, semi;
                            points=each_integrated_particle(particle_system)) do particle,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
-        # Only consider particles with a distance > 0.
-        # See `src/general/smoothing_kernels.jl` for more details.
-        distance^2 < eps(initial_smoothing_length(particle_system)^2) && return
-
-        grad_kernel = smoothing_kernel_grad(neighbor_system, pos_diff, distance, neighbor)
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(neighbor_system) && distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        # Note that we use the `neighbor_system` to compute the kernel gradient
+        # to obtain the same force as in the fluid-structure interaction.
+        grad_kernel = smoothing_kernel_grad_unsafe(neighbor_system, pos_diff,
+                                                   distance, neighbor)
 
         m_b = hydrodynamic_mass(neighbor_system, neighbor)
 
diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl
index a0fb522b4d..3383929c28 100644
--- a/src/schemes/structure/total_lagrangian_sph/rhs.jl
+++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -20,22 +20,33 @@ end
     # Everything here is done in the initial coordinates
     system_coords = initial_coordinates(system)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff.
     # For structure-structure interaction, this has to happen in the initial coordinates.
     foreach_point_neighbor(system, system, system_coords, system_coords, semi;
                            points=each_integrated_particle(system)) do particle, neighbor,
                                                                        initial_pos_diff,
                                                                        initial_distance
-        # Only consider particles with a distance > 0.
-        # See `src/general/smoothing_kernels.jl` for more details.
-        initial_distance^2 < eps(initial_smoothing_length(system)^2) && return
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(system) && initial_distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
+                                                   initial_distance, particle)
 
         rho_a = @inbounds system.material_density[particle]
         rho_b = @inbounds system.material_density[neighbor]
 
-        grad_kernel = smoothing_kernel_grad(system, initial_pos_diff,
-                                            initial_distance, particle)
-
         m_a = @inbounds system.mass[particle]
         m_b = @inbounds system.mass[neighbor]
 
diff --git a/src/schemes/structure/total_lagrangian_sph/system.jl b/src/schemes/structure/total_lagrangian_sph/system.jl
index cac4ed1f12..b4fb02f31a 100644
--- a/src/schemes/structure/total_lagrangian_sph/system.jl
+++ b/src/schemes/structure/total_lagrangian_sph/system.jl
@@ -475,13 +475,27 @@ end
     # Reset deformation gradient
     set_zero!(deformation_grad)
 
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
+    # and the density diffusion divide by zero.
+    # To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
+    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
+    # Note that `sqrt(eps(h^2)) != eps(h)`.
+    h = initial_smoothing_length(system)
+    almostzero = sqrt(eps(h^2))
+
     # Loop over all pairs of particles and neighbors within the kernel cutoff
     initial_coords = initial_coordinates(system)
     foreach_point_neighbor(system, system, initial_coords, initial_coords,
                            semi) do particle, neighbor, initial_pos_diff, initial_distance
-        # Only consider particles with a distance > 0.
-        # See `src/general/smoothing_kernels.jl` for more details.
-        initial_distance^2 < eps(initial_smoothing_length(system)^2) && return
+        # Skip neighbors with the same position because the kernel gradient is zero.
+        # Note that `return` only exits the closure, i.e., skips the current neighbor.
+        skip_zero_distance(system) && initial_distance < almostzero && return
+
+        # Now that we know that `distance` is not zero, we can safely call the unsafe
+        # version of the kernel gradient to avoid redundant zero checks.
+        grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
+                                                   initial_distance, particle)
 
         volume = @inbounds mass[neighbor] / material_density[neighbor]
         pos_diff_ = @inbounds current_coords(system, particle) -
@@ -489,9 +503,6 @@ end
         # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
         pos_diff = convert.(eltype(system), pos_diff_)
 
-        grad_kernel = smoothing_kernel_grad(system, initial_pos_diff,
-                                            initial_distance, particle)
-
         # Multiply by L_{0a}
         L = @inbounds correction_matrix(system, particle)
 
diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl
index 3eae7b839b..466297b355 100644
--- a/test/examples/gpu.jl
+++ b/test/examples/gpu.jl
@@ -331,10 +331,8 @@ end
                                                                      fluid_density_calculator=SummationDensity(),
                                                                      maxiters=38, # 38 time steps on CPU
                                                                      clip_negative_pressure=true),
-                # Broken due to https://github.com/JuliaGPU/CUDA.jl/issues/2681
-                # and https://github.com/JuliaGPU/Metal.jl/issues/550.
-                # "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1,
-                #                                                smoothing_kernel=SchoenbergQuarticSplineKernel{2}()),
+                "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1,
+                                                               smoothing_kernel=SchoenbergQuarticSplineKernel{2}()),
                 "WCSPH with SchoenbergQuinticSplineKernel" => (smoothing_length=1.1,
                                                                smoothing_kernel=SchoenbergQuinticSplineKernel{2}()),
                 "WCSPH with WendlandC2Kernel" => (smoothing_length=1.5,
diff --git a/test/general/smoothing_kernels.jl b/test/general/smoothing_kernels.jl
index fdd7f9a142..e112d9562b 100644
--- a/test/general/smoothing_kernels.jl
+++ b/test/general/smoothing_kernels.jl
@@ -1,19 +1,29 @@
 @testset verbose=true "Smoothing Kernels" begin
     # Don't show all kernel tests in the final overview
     @testset verbose=false "Integral" begin
-        # All smoothing kernels should integrate to something close to 1
+        # All smoothing kernels should integrate to something close to 1.
+        # We integrate slightly beyond the compact support to verify that the kernel is
+        # correctly evaluating to zero there.
         function integrate_kernel_2d(smk)
             integral_2d_radial,
             _ = quadgk(r -> r * TrixiParticles.kernel(smk, r, 1.0), 0,
-                       TrixiParticles.compact_support(smk, 1.0),
+                       TrixiParticles.compact_support(smk, 1.0) * 1.1,
                        rtol=1e-15)
             return 2 * pi * integral_2d_radial
         end
 
+        function integrate_kernel_1d(smk)
+            integral_1d_half,
+            _ = quadgk(r -> TrixiParticles.kernel(smk, r, 1.0), 0,
+                       TrixiParticles.compact_support(smk, 1.0) * 1.1,
+                       rtol=1e-15)
+            return 2 * integral_1d_half
+        end
+
         function integrate_kernel_3d(smk)
             integral_3d_radial,
             _ = quadgk(r -> r^2 * TrixiParticles.kernel(smk, r, 1.0), 0,
-                       TrixiParticles.compact_support(smk, 1.0),
+                       TrixiParticles.compact_support(smk, 1.0) * 1.1,
                        rtol=1e-15)
             return 4 * pi * integral_3d_radial
         end
@@ -41,6 +51,13 @@
             LaguerreGaussKernel
         ]
 
+        kernels_1d = [
+            SchoenbergCubicSplineKernel,
+            SchoenbergQuarticSplineKernel,
+            SchoenbergQuinticSplineKernel,
+            LaguerreGaussKernel
+        ]
+
         @testset "$kernel" for kernel in kernels
             # The integral should be 1 for all kernels
             error_2d = abs(integrate_kernel_2d(kernel{2}()) - 1.0)
@@ -48,6 +65,11 @@
 
             @test error_2d <= 1e-15
             @test error_3d <= 1e-15
+
+            if kernel in kernels_1d
+                error_1d = abs(integrate_kernel_1d(kernel{1}()) - 1.0)
+                @test error_1d <= 1e-15
+            end
         end
     end
 
@@ -67,16 +89,31 @@
             LaguerreGaussKernel
         ]
 
+        kernels_1d = [
+            SchoenbergCubicSplineKernel,
+            SchoenbergQuarticSplineKernel,
+            SchoenbergQuinticSplineKernel,
+            LaguerreGaussKernel
+        ]
+
         # Test 4 different smoothing lengths
         smoothing_lengths = 0.25:0.25:1
 
-        @testset "$kernel_type" for kernel_type in kernels
-            for ndims in 2:3
+        for ndims in 1:3
+            kernels_ndims = ndims == 1 ? kernels_1d : kernels
+
+            @testset "$kernel_type{$ndims}" for kernel_type in kernels_ndims
                 kernel_ = kernel_type{ndims}()
 
                 for h in smoothing_lengths
+                    compact_support_ = TrixiParticles.compact_support(kernel_, h)
                     # Test 11 different radii
-                    radii = 0:(0.1h):(h + eps())
+                    radii = 0:(0.1 * compact_support_):(compact_support_ * 1.01)
+
+                    if kernel_ isa SpikyKernel
+                        # The Spiky kernel is not differentiable at r=0
+                        radii = radii[2:end]
+                    end
 
                     for r in radii
                         # Automatic differentation of `kernel`
@@ -111,11 +148,20 @@
             LaguerreGaussKernel
         ]
 
+        kernels_1d = [
+            SchoenbergCubicSplineKernel,
+            SchoenbergQuarticSplineKernel,
+            SchoenbergQuinticSplineKernel,
+            LaguerreGaussKernel
+        ]
+
         # Test different smoothing length types
         smoothing_lengths = (0.5, 0.5f0)
 
-        @testset "$kernel_type" for kernel_type in kernels
-            for ndims in 2:3
+        for ndims in 1:3
+            kernels_ndims = ndims == 1 ? kernels_1d : kernels
+
+            @testset "$kernel_type{$ndims}" for kernel_type in kernels_ndims
                 kernel_ = kernel_type{ndims}()
 
                 for h in smoothing_lengths
diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl
index 69f4a920c4..2b145859df 100644
--- a/test/schemes/structure/total_lagrangian_sph/rhs.jl
+++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -93,10 +93,11 @@
             TrixiParticles.each_integrated_particle(::MockSystem) = each_integrated_particle
             TrixiParticles.smoothing_length(::MockSystem, _) = eps()
 
-            function TrixiParticles.add_acceleration!(_, _, ::MockSystem)
-                return nothing
+            function TrixiParticles.smoothing_kernel_grad_unsafe(::MockSystem,
+                                                                 pos_diff, distance,
+                                                                 particle)
+                return kernel_deriv * pos_diff / distance
             end
-            TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) = kernel_deriv
             TrixiParticles.compact_support(::MockSystem, ::MockSystem) = 1000.0
             function TrixiParticles.current_coords(system::MockSystem, particle)
                 return TrixiParticles.current_coords(initial_coordinates, system, particle)
diff --git a/test/systems/tlsph_system.jl b/test/systems/tlsph_system.jl
index 1e7b1edff5..3bc3ea2ceb 100644
--- a/test/systems/tlsph_system.jl
+++ b/test/systems/tlsph_system.jl
@@ -184,10 +184,11 @@
 
                 Base.getindex(::Val{:mock_material_density}, ::Int64) = density
 
-                function TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _)
-                    return kernel_derivative
+                function TrixiParticles.smoothing_kernel_grad_unsafe(::Val{:mock_system_tensor},
+                                                                     pos_diff, distance,
+                                                                     particle)
+                    return kernel_derivative * pos_diff / distance
                 end
-                Base.eps(::Type{Val{:mock_smoothing_length}}) = eps()
                 semi = DummySemidiscretization()
 
                 # Compute deformation gradient