From 58d3ddef297ed94816295bda6a159375b82fec2d Mon Sep 17 00:00:00 2001
From: Jeremy
Date: Thu, 19 Mar 2026 18:18:44 +0100
Subject: [PATCH 1/7] enabled MPI parallelism for viscous problem

---
 ...semidiscretization_hyperbolic_parabolic.jl |    8 +-
 src/solvers/dgsem_p4est/dg.jl                 |  141 +-
 src/solvers/dgsem_p4est/dg_2d_parabolic.jl    |   36 +-
 .../dgsem_p4est/dg_3d_parabolic_parallel.jl   | 2038 +++++++++++++++++
 src/solvers/dgsem_p4est/dg_parallel.jl        |   23 +
 src/solvers/dgsem_tree/dg_3d_parabolic.jl     |    2 +-
 6 files changed, 2182 insertions(+), 66 deletions(-)
 create mode 100644 src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl

diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl
index c730439017c..ff181897740 100644
--- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl
+++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl
@@ -173,6 +173,12 @@ function Base.show(io::IO, semi::SemidiscretizationHyperbolicParabolic)
         print(io, key)
     end
-    print(io, "))")
+    print(io, ")")
+    print(io, ", cacheParabolic(")
+    for (idx, key) in enumerate(keys(semi.cache_parabolic))
+        idx > 1 && print(io, " ")
+        print(io, key)
+    end
+    print(io, "))")
 
     return nothing
 end
@@ -360,7 +366,7 @@ function semidiscretize(semi::SemidiscretizationHyperbolicParabolic, tspan,
     # We could also construct an `ODEFunction` explicitly without the Jacobian here,
     # but we stick to the lean direct in-place function `rhs_parabolic!` and
     # let OrdinaryDiffEq.jl handle the rest
-    return SplitODEProblem{iip}(rhs_parabolic!, rhs!, u0_ode, tspan, semi)
+    return SplitODEProblem{iip}(rhs_parabolic!, rhs!, u0_ode, tspan, semi)
 end
 end

diff --git a/src/solvers/dgsem_p4est/dg.jl b/src/solvers/dgsem_p4est/dg.jl
index 12e13f0d239..9f0670ee2b1 100644
--- a/src/solvers/dgsem_p4est/dg.jl
+++ b/src/solvers/dgsem_p4est/dg.jl
@@ -8,66 +8,110 @@
 # This method is called when a SemidiscretizationHyperbolic is constructed.
# It constructs the basic `cache` used throughout the simulation to compute # the RHS etc. -function create_cache(mesh::P4estMesh, equations::AbstractEquations, dg::DG, ::Any, - ::Type{uEltype}) where {uEltype <: Real} - # Make sure to balance the `p4est` before creating any containers - # in case someone has tampered with the `p4est` after creating the mesh - balance!(mesh) + function create_cache(mesh::P4estMesh, equations::AbstractEquations, dg::DG, ::Any, + ::Type{uEltype}) where {uEltype <: Real} + # Make sure to balance the `p4est` before creating any containers + # in case someone has tampered with the `p4est` after creating the mesh + balance!(mesh) + + elements = init_elements(mesh, equations, dg.basis, uEltype) + interfaces = init_interfaces(mesh, equations, dg.basis, elements) + boundaries = init_boundaries(mesh, equations, dg.basis, elements) + mortars = init_mortars(mesh, equations, dg.basis, elements) + + # Container cache + cache = (; elements, interfaces, boundaries, mortars) + + # Add Volume-Integral cache + cache = (; cache..., + create_cache(mesh, equations, dg.volume_integral, dg, cache, uEltype)...) + # Add Mortar cache + cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) + + return cache + end - elements = init_elements(mesh, equations, dg.basis, uEltype) - interfaces = init_interfaces(mesh, equations, dg.basis, elements) - boundaries = init_boundaries(mesh, equations, dg.basis, elements) - mortars = init_mortars(mesh, equations, dg.basis, elements) + # This method is called when a SemidiscretizationHyperbolic is constructed. + # It constructs the basic `cache` used throughout the simulation to compute + # the RHS etc. 
+ function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG, ::Any, + ::Type{uEltype}) where {uEltype <: Real} + # Make sure to balance the `p4est` before creating any containers + # in case someone has tampered with the `p4est` after creating the mesh + balance!(mesh.parent) + + elements_parent = init_elements(mesh.parent, equations, dg.basis, uEltype) + interfaces_parent = init_interfaces(mesh.parent, equations, dg.basis, + elements_parent) + boundaries_parent = init_boundaries(mesh.parent, equations, dg.basis, + elements_parent) + mortars_parent = init_mortars(mesh.parent, equations, dg.basis, elements_parent) + + # Extract data for views. + elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, + interfaces_parent, + boundaries_parent, + mortars_parent, + mesh, + equations, + dg, + uEltype) + + cache = (; elements, interfaces, boundaries, mortars, neighbor_ids_parent) + + # Add Volume-Integral cache + cache = (; cache..., + create_cache(mesh, equations, dg.volume_integral, dg, uEltype)...) + # Add Mortar cache + cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) + + return cache + end - # Container cache - cache = (; elements, interfaces, boundaries, mortars) +function create_cache_parabolic(mesh::P4estMesh, + equations_hyperbolic::AbstractEquations, + dg::DG, n_elements, uEltype) + # Keep consistent with the baseline p4est cache + balance!(mesh) - # Add Volume-Integral cache - cache = (; cache..., - create_cache(mesh, equations, dg.volume_integral, dg, cache, uEltype)...) - # Add Mortar cache - cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) 
+ elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype) + interfaces = init_interfaces(mesh, equations_hyperbolic, dg.basis, elements) + boundaries = init_boundaries(mesh, equations_hyperbolic, dg.basis, elements) + viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) - return cache + return (; elements, interfaces, boundaries, viscous_container) end -# This method is called when a SemidiscretizationHyperbolic is constructed. -# It constructs the basic `cache` used throughout the simulation to compute -# the RHS etc. -function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG, ::Any, - ::Type{uEltype}) where {uEltype <: Real} - # Make sure to balance the `p4est` before creating any containers - # in case someone has tampered with the `p4est` after creating the mesh +function create_cache_parabolic(mesh::P4estMeshView, + equations_hyperbolic::AbstractEquations, + dg::DG, n_elements, uEltype) + + balance!(mesh.parent) - elements_parent = init_elements(mesh.parent, equations, dg.basis, uEltype) - interfaces_parent = init_interfaces(mesh.parent, equations, dg.basis, + elements_parent = init_elements(mesh.parent, equations_hyperbolic, dg.basis, uEltype) + interfaces_parent = init_interfaces(mesh.parent, equations_hyperbolic, dg.basis, elements_parent) - boundaries_parent = init_boundaries(mesh.parent, equations, dg.basis, + boundaries_parent = init_boundaries(mesh.parent, equations_hyperbolic, dg.basis, elements_parent) - mortars_parent = init_mortars(mesh.parent, equations, dg.basis, elements_parent) - - # Extract data for views. 
- elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, - interfaces_parent, - boundaries_parent, - mortars_parent, - mesh, - equations, - dg, - uEltype) - - cache = (; elements, interfaces, boundaries, mortars, neighbor_ids_parent) - - # Add Volume-Integral cache - cache = (; cache..., - create_cache(mesh, equations, dg.volume_integral, dg, uEltype)...) - # Add Mortar cache - cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) - - return cache + mortars_parent = init_mortars(mesh.parent, equations_hyperbolic, dg.basis, + elements_parent) + + elements, interfaces, boundaries, mortars, neighbor_ids_parent = + extract_p4est_mesh_view(elements_parent, interfaces_parent, + boundaries_parent, mortars_parent, + mesh, equations_hyperbolic, dg, uEltype) + + viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) + + return (; elements, interfaces, boundaries, neighbor_ids_parent, viscous_container) end + # Extract outward-pointing normal direction # (contravariant vector ±Ja^i, i = index) # Note that this vector is not normalized @@ -91,6 +135,7 @@ include("dg_2d_parabolic.jl") include("dg_3d.jl") include("dg_3d_parabolic.jl") +include("dg_3d_parabolic_parallel.jl") include("dg_parallel.jl") # Subcell limiters diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 35372eecea7..4caba61d369 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -33,23 +33,27 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, @unpack viscous_container = cache_parabolic @unpack u_transformed, gradients, flux_viscous = viscous_container + cache_p = (; cache..., + elements = cache_parabolic.elements, + interfaces = cache_parabolic.interfaces, + boundaries = cache_parabolic.boundaries) # Convert conservative variables to a form more 
suitable for viscous flux calculations @trixi_timeit timer() "transform variables" begin transform_variables!(u_transformed, u, mesh, equations_parabolic, - dg, cache) + dg, cache_p) end # Compute the gradients of the transformed variables @trixi_timeit timer() "calculate gradient" begin calc_gradient!(gradients, u_transformed, t, mesh, equations_parabolic, boundary_conditions_parabolic, - dg, parabolic_scheme, cache) + dg, parabolic_scheme, cache_p) end # Compute and store the viscous fluxes @trixi_timeit timer() "calculate viscous fluxes" begin calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, - equations_parabolic, dg, cache) + equations_parabolic, dg, cache_p) end # The remainder of this function is essentially a regular rhs! for parabolic @@ -65,41 +69,41 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # need to interpolate solutions *and* gradients to the surfaces. # Reset du - @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) + @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache_p) # Calculate volume integral # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache) + calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache_p) end # Prolong solution to interfaces. # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache_p, flux_viscous, mesh, equations_parabolic, dg) end # Calculate interface fluxes # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. 
@trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, - equations_parabolic, dg, parabolic_scheme, cache) + calc_interface_flux!(cache_p.elements.surface_flux_values, mesh, + equations_parabolic, dg, parabolic_scheme, cache_p) end # Prolong viscous fluxes to boundaries. # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache_p, flux_viscous, mesh, equations_parabolic, dg) end # Calculate boundary fluxes. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux_divergence!(cache, t, + calc_boundary_flux_divergence!(cache_p, t, boundary_conditions_parabolic, mesh, equations_parabolic, dg.surface_integral, dg) @@ -108,33 +112,33 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Prolong solution to mortars. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars_divergence!(cache, flux_viscous, mesh, equations_parabolic, + prolong2mortars_divergence!(cache_p, flux_viscous, mesh, equations_parabolic, dg.mortar, dg) end # Calculate mortar fluxes. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "mortar flux" begin - calc_mortar_flux_divergence!(cache.elements.surface_flux_values, + calc_mortar_flux_divergence!(cache_p.elements.surface_flux_values, mesh, equations_parabolic, dg.mortar, - dg, parabolic_scheme, cache) + dg, parabolic_scheme, cache_p) end # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. 
@trixi_timeit timer() "surface integral" begin calc_surface_integral!(du, u, mesh, equations_parabolic, - dg.surface_integral, dg, cache) + dg.surface_integral, dg, cache_p) end # Apply Jacobian from mapping to reference element @trixi_timeit timer() "Jacobian" begin - apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache) + apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache_p) end @trixi_timeit timer() "source terms parabolic" begin calc_sources_parabolic!(du, u, gradients, t, source_terms_parabolic, - equations_parabolic, dg, cache) + equations_parabolic, dg, cache_p) end return nothing diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl new file mode 100644 index 00000000000..7447d67a6ae --- /dev/null +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl @@ -0,0 +1,2038 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! 
format: noindent + + + + +function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + boundary_conditions_parabolic, source_terms_parabolic, + dg::DG, parabolic_scheme, cache, cache_parabolic) + @unpack viscous_container = cache_parabolic + @unpack u_transformed, gradients, flux_viscous = viscous_container + + cache_p = (; cache..., + elements = cache_parabolic.elements, + interfaces = cache_parabolic.interfaces, + boundaries = cache_parabolic.boundaries) + + # + # Stage 0: local variable transform + # + @trixi_timeit timer() "transform variables" begin + transform_variables!(u_transformed, u, mesh, equations_parabolic, + dg, cache_p) + end + + # + # Stage 1: gradient computation + # + + # Start gradient-stage MPI receive + @trixi_timeit timer() "start MPI receive gradient" begin + start_mpi_receive!(cache_p.mpi_cache) + end + + # Prolong transformed variables to MPI mortars + @trixi_timeit timer() "prolong2mpimortars gradient" begin + prolong2mpimortars!(cache_p, u_transformed, mesh, equations_parabolic, + dg.mortar, dg) + end + + # Prolong transformed variables to MPI interfaces + @trixi_timeit timer() "prolong2mpiinterfaces gradient" begin + prolong2mpiinterfaces!(cache_p, u_transformed, mesh, equations_parabolic, + dg.surface_integral, dg) + end + + # Start gradient-stage MPI send + @trixi_timeit timer() "start MPI send gradient" begin + start_mpi_send!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) + end + + # Local gradient computation + @trixi_timeit timer() "calculate gradient local" begin + calc_gradient!(gradients, u_transformed, t, mesh, + equations_parabolic, boundary_conditions_parabolic, + dg, parabolic_scheme, cache_p) + end + + # Finish gradient-stage MPI receive + @trixi_timeit timer() "finish MPI receive gradient" begin + finish_mpi_receive!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) + end + + # MPI interface fluxes for gradient stage + 
@trixi_timeit timer() "MPI interface flux gradient" begin + calc_mpi_interface_flux_gradient!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, + dg, parabolic_scheme, cache_p) + end + + # MPI mortar fluxes for gradient stage + @trixi_timeit timer() "MPI mortar flux gradient" begin + calc_mpi_mortar_flux_gradient!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache_p) + end + + # Finish gradient-stage MPI send + @trixi_timeit timer() "finish MPI send gradient" begin + finish_mpi_send!(cache_p.mpi_cache) + end + + # + # Stage 2: local viscous flux construction + # + @trixi_timeit timer() "calculate viscous fluxes" begin + calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, + equations_parabolic, dg, cache_p) + end + + # + # Stage 3: divergence of viscous fluxes + # + + # Start divergence-stage MPI receive + @trixi_timeit timer() "start MPI receive divergence" begin + start_mpi_receive!(cache_p.mpi_cache) + end + + # Reset du + @trixi_timeit timer() "reset ∂u/∂t" begin + set_zero!(du, dg, cache_p) + end + + # Local volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache_p) + end + + # Prolong viscous fluxes to MPI mortars + @trixi_timeit timer() "prolong2mpimortars divergence" begin + prolong2mpimortars_divergence!(cache_p, flux_viscous, mesh, equations_parabolic, + dg.mortar, dg) + end + + # Prolong viscous fluxes to MPI interfaces + @trixi_timeit timer() "prolong2mpiinterfaces divergence" begin + prolong2mpiinterfaces!(cache_p, flux_viscous, mesh, equations_parabolic, dg) + end + + # Start divergence-stage MPI send + @trixi_timeit timer() "start MPI send divergence" begin + start_mpi_send!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) + end + + # Local interface fluxes + @trixi_timeit timer() "prolong2interfaces" begin + prolong2interfaces!(cache_p, flux_viscous, mesh, 
equations_parabolic, dg) + end + + @trixi_timeit timer() "interface flux" begin + calc_interface_flux!(cache_p.elements.surface_flux_values, mesh, + equations_parabolic, dg, parabolic_scheme, cache_p) + end + + # Local boundary fluxes + @trixi_timeit timer() "prolong2boundaries" begin + prolong2boundaries!(cache_p, flux_viscous, mesh, equations_parabolic, dg) + end + + @trixi_timeit timer() "boundary flux" begin + calc_boundary_flux_divergence!(cache_p, t, + boundary_conditions_parabolic, mesh, + equations_parabolic, + dg.surface_integral, dg) + end + + # Local mortar fluxes + @trixi_timeit timer() "prolong2mortars" begin + prolong2mortars_divergence!(cache_p, flux_viscous, mesh, equations_parabolic, + dg.mortar, dg) + end + + @trixi_timeit timer() "mortar flux" begin + calc_mortar_flux_divergence!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache_p) + end + + # Finish divergence-stage MPI receive + @trixi_timeit timer() "finish MPI receive divergence" begin + finish_mpi_receive!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) + end + + # MPI interface fluxes for divergence stage + @trixi_timeit timer() "MPI interface flux divergence" begin + calc_mpi_interface_flux_divergence!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, + dg, parabolic_scheme, cache_p) + end + + # MPI mortar fluxes for divergence stage + @trixi_timeit timer() "MPI mortar flux divergence" begin + calc_mpi_mortar_flux_divergence!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache_p) + end + + # Finish divergence-stage MPI send + @trixi_timeit timer() "finish MPI send divergence" begin + finish_mpi_send!(cache_p.mpi_cache) + end + + # + # Stage 4: final assembly + # + @trixi_timeit timer() "surface integral" begin + calc_surface_integral!(du, u, mesh, equations_parabolic, + dg.surface_integral, dg, cache_p) + end + + @trixi_timeit timer() "Jacobian" 
begin + apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache_p) + end + + @trixi_timeit timer() "source terms parabolic" begin + calc_sources_parabolic!(du, u, gradients, t, source_terms_parabolic, + equations_parabolic, dg, cache_p) + end + + return nothing +end + + + + + + +function calc_gradient!(gradients, u_transformed, t, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, boundary_conditions_parabolic, + dg::DG, parabolic_scheme, cache) + # Reset gradients + @trixi_timeit timer() "reset gradients" begin + reset_gradients!(gradients, dg, cache) + end + + # Calculate volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral_gradient!(gradients, u_transformed, + mesh, equations_parabolic, dg, cache) + end + + # Prolong solution to interfaces. + # This reuses `prolong2interfaces` for the purely hyperbolic case. + @trixi_timeit timer() "prolong2interfaces" begin + prolong2interfaces!(cache, u_transformed, mesh, + equations_parabolic, dg) + end + + # Calculate interface fluxes for the gradients + @trixi_timeit timer() "interface flux" begin + @unpack surface_flux_values = cache.elements + calc_interface_flux_gradient!(surface_flux_values, mesh, equations_parabolic, + dg, parabolic_scheme, cache) + end + + # Prolong solution to boundaries. + # This reuses `prolong2boundaries` for the purely hyperbolic case. + @trixi_timeit timer() "prolong2boundaries" begin + prolong2boundaries!(cache, u_transformed, mesh, + equations_parabolic, dg) + end + + # Calculate boundary fluxes + @trixi_timeit timer() "boundary flux" begin + calc_boundary_flux_gradient!(cache, t, boundary_conditions_parabolic, + mesh, equations_parabolic, dg.surface_integral, + dg) + end + + # Prolong solution to mortars. + # This reuses `prolong2mortars` for the purely hyperbolic case. 
+ @trixi_timeit timer() "prolong2mortars" begin + prolong2mortars!(cache, u_transformed, mesh, equations_parabolic, + dg.mortar, dg) + end + + # Calculate mortar fluxes + @trixi_timeit timer() "mortar flux" begin + calc_mortar_flux_gradient!(cache.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache) + end + + + # Calculate MPI interface fluxes + @trixi_timeit timer() "MPI interface flux" begin + @unpack surface_flux_values = cache.elements + calc_mpi_interface_flux_gradient!(surface_flux_values, mesh, equations_parabolic, + dg, parabolic_scheme, cache) + end + + # Calculate MPI mortar fluxes + @trixi_timeit timer() "MPI mortar flux" begin + calc_mpi_mortar_flux_gradient!(cache.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache) + end + + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral_gradient!(gradients, mesh, equations_parabolic, + dg, cache) + end + + + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" begin + apply_jacobian_parabolic!(gradients, mesh, equations_parabolic, dg, + cache) + end + + return nothing +end + + + +# This version is called during `calc_gradients!` and must be specialized because the +# MAGNITUDE of the flux does NOT depend on the outward normal. +# Thus, you don't need to scale by 2 (e.g., the scaling factor in the normals (and in the +# contravariant vectors) along large/small elements across a non-conforming +# interface in 2D) and flip the sign when storing the mortar fluxes back +# into `surface_flux_values`. 
+@inline function mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM, cache, mortar, + fstar_primary, + fstar_secondary, u_buffer, + fstar_tmp) + @unpack neighbor_ids, node_indices = cache.mortars + index_range = eachnode(dg) + # Copy solution small to small + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + + for position in 1:4 # Loop over small elements + element = neighbor_ids[position, mortar] + for j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v, + i, + j, + position] + end + end + end + + # Project small fluxes to large element. + multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_lower, mortar_l2.reverse_lower, + view(fstar_secondary, .., 1), fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, mortar_l2.reverse_lower, + view(fstar_secondary, .., 2), fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_lower, mortar_l2.reverse_upper, + view(fstar_secondary, .., 3), fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, mortar_l2.reverse_upper, + view(fstar_secondary, .., 4), fstar_tmp) + + # Copy interpolated flux values from buffer to large element face in the + # correct orientation. + # Note that the index of the small sides will always run forward but + # the index of the large side might need to run backwards for flipped sides. 
+ large_element = neighbor_ids[5, mortar] + large_indices = node_indices[2, mortar] + large_direction = indices2direction(large_indices) + large_surface_indices = surface_indices(large_indices) + + i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_surface_indices[1], + index_range) + j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_surface_indices[2], + index_range) + + # Note that the indices of the small sides will always run forward but + # the large indices might need to run backwards for flipped sides. + i_large = i_large_start + j_large = j_large_start + for j in eachnode(dg) + for i in eachnode(dg) + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i_large, j_large, large_direction, large_element] = u_buffer[v, + i, + j] + end + i_large += i_large_step_i + j_large += j_large_step_i + end + i_large += i_large_step_j + j_large += j_large_step_j + end + + return nothing +end + +function calc_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache) + @unpack neighbor_ids, node_indices = cache.interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + @threaded for interface in eachinterface(dg, cache) + # Get element and side information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction = indices2direction(primary_indices) + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + + # Get 
element and side information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + secondary_surface_indices = surface_indices(secondary_indices) + + # Get the surface indexing on the secondary element. + # Note that the indices of the primary side will always run forward but + # the secondary indices might need to run backwards for flipped sides. + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], + index_range) + i_secondary = i_secondary_start + j_secondary = j_secondary_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, k_primary, + primary_element) + + calc_interface_flux_gradient!(surface_flux_values, mesh, + equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, i, j, + primary_direction, primary_element, + i_secondary, j_secondary, + secondary_direction, secondary_element) + + # Increment the primary element indices + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + # Increment the secondary element surface indices + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + end + end + + return nothing +end + +# This version is used for parabolic gradient computations +@inline function calc_interface_flux_gradient!(surface_flux_values, mesh::Union{P4estMeshParallel{3}, 
T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + interface_index, normal_direction, + primary_i_node_index, + primary_j_node_index, + primary_direction_index, + primary_element_index, + secondary_i_node_index, + secondary_j_node_index, + secondary_direction_index, + secondary_element_index) + @unpack u = cache.interfaces + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, + primary_i_node_index, primary_j_node_index, + interface_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + for v in eachvariable(equations_parabolic) + surface_flux_values[v, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index] = flux_[v] + # No sign flip required for gradient calculation because for parabolic terms, + # the normals are not embedded in `flux_` for gradient computations. + surface_flux_values[v, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index] = flux_[v] + end + + return nothing +end + + + +function calc_mpi_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache) + @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + @threaded for interface in eachmpiinterface(dg, cache) + local_element = local_neighbor_ids[interface] + local_indices = node_indices[interface] + local_direction = indices2direction(local_indices) + local_side = local_sides[interface] + + # Create the local i,j,k indexing on the local element used to pull normal direction information + i_element_start, i_element_step_i, i_element_step_j = index_to_start_step_3d(local_indices[1], + index_range) + j_element_start, j_element_step_i, j_element_step_j = index_to_start_step_3d(local_indices[2], + 
index_range) + k_element_start, k_element_step_i, k_element_step_j = index_to_start_step_3d(local_indices[3], + index_range) + + i_element = i_element_start + j_element = j_element_start + k_element = k_element_start + + # Initiate the node indices to be used in the surface for loop, + # the surface flux storage must be indexed in alignment with the local element indexing + local_surface_indices = surface_indices(local_indices) + i_surface_start, i_surface_step_i, i_surface_step_j = index_to_start_step_3d(local_surface_indices[1], + index_range) + j_surface_start, j_surface_step_i, j_surface_step_j = index_to_start_step_3d(local_surface_indices[2], + index_range) + i_surface = i_surface_start + j_surface = j_surface_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(local_direction, + contravariant_vectors, + i_element, j_element, k_element, + local_element) + + calc_mpi_interface_flux_gradient!(surface_flux_values, mesh, + equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, + i, j, local_side, + i_surface, j_surface, + local_direction, local_element) + # Increment local element indices to pull the normal direction + i_element += i_element_step_i + j_element += j_element_step_i + k_element += k_element_step_i + # Increment the surface node indices along the local element + i_surface += i_surface_step_i + j_surface += j_surface_step_i + end + # Increment local element indices to pull the normal direction + i_element += i_element_step_j + j_element += j_element_step_j + k_element += k_element_step_j + # Increment the surface node indices along the local element + i_surface += i_surface_step_j + j_surface += j_surface_step_j + end + end + + return nothing +end +@inline function calc_mpi_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + interface_index, normal_direction, + 
interface_i_node_index, + interface_j_node_index, local_side, + surface_i_node_index, + surface_j_node_index, + local_direction_index, + local_element_index) + @unpack u = cache.mpi_interfaces + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, + interface_i_node_index, + interface_j_node_index, + interface_index) + + if local_side == 1 + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + else # local_side == 2 + flux_ = flux_parabolic(u_ll, u_rr, -normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + end + + for v in eachvariable(equations_parabolic) + surface_flux_values[v, surface_i_node_index, surface_j_node_index, + local_direction_index, local_element_index] = flux_[v] + end + + return nothing +end + + +# This is the version used when calculating the divergence of the viscous fluxes. +# Identical to weak-form volume integral/kernel for the purely hyperbolic case, +# except that the fluxes are here already precomputed in `calc_viscous_fluxes!` +function calc_volume_integral!(du, flux_viscous, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM, cache) + (; derivative_hat) = dg.basis + (; contravariant_vectors) = cache.elements + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for element in eachelement(dg, cache) + # Calculate volume terms in one element + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + flux1 = get_node_vars(flux_viscous_x, equations_parabolic, dg, + i, j, k, element) + flux2 = get_node_vars(flux_viscous_y, equations_parabolic, dg, + i, j, k, element) + flux3 = get_node_vars(flux_viscous_z, equations_parabolic, dg, + i, j, k, element) + + # Compute the contravariant flux by taking the scalar product of the + # first contravariant vector Ja^1 and the flux vector + Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, 
element)
+            contravariant_flux1 = Ja11 * flux1 + Ja12 * flux2 + Ja13 * flux3
+            for ii in eachnode(dg)
+                multiply_add_to_node_vars!(du, derivative_hat[ii, i],
+                                           contravariant_flux1,
+                                           equations_parabolic, dg, ii, j, k, element)
+            end
+
+            # Compute the contravariant flux by taking the scalar product of the
+            # second contravariant vector Ja^2 and the flux vector
+            Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors,
+                                                        i, j, k, element)
+            contravariant_flux2 = Ja21 * flux1 + Ja22 * flux2 + Ja23 * flux3
+            for jj in eachnode(dg)
+                multiply_add_to_node_vars!(du, derivative_hat[jj, j],
+                                           contravariant_flux2,
+                                           equations_parabolic, dg, i, jj, k, element)
+            end
+
+            # Compute the contravariant flux by taking the scalar product of the
+            # third contravariant vector Ja^3 and the flux vector
+            Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors,
+                                                        i, j, k, element)
+            contravariant_flux3 = Ja31 * flux1 + Ja32 * flux2 + Ja33 * flux3
+            for kk in eachnode(dg)
+                multiply_add_to_node_vars!(du, derivative_hat[kk, k],
+                                           contravariant_flux3,
+                                           equations_parabolic, dg, i, j, kk, element)
+            end
+        end
+    end
+
+    return nothing
+end
+
+# This is the version used when calculating the divergence of the viscous fluxes.
+# Specialization `flux_viscous::Tuple` needed to
+# avoid ambiguity with the hyperbolic version of `prolong2interfaces!` in dg_3d.jl
+# which is for the variables themselves, i.e., `u::Array{uEltype, 5}`.
+function prolong2interfaces!(cache, flux_viscous::Tuple,
+                             mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                             equations_parabolic::AbstractEquationsParabolic,
+                             dg::DG)
+    (; interfaces) = cache
+    (; contravariant_vectors) = cache.elements
+    index_range = eachnode(dg)
+    flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous
+
+    @threaded for interface in eachinterface(dg, cache)
+        # Copy solution data from the primary element using "delayed indexing" with
+        # a start value and a step size to get the correct face and orientation.
+ # Note that in the current implementation, the interface will be + # "aligned at the primary element", i.e., the index of the primary side + # will always run forwards. + primary_element = interfaces.neighbor_ids[1, interface] + primary_indices = interfaces.node_indices[1, interface] + primary_direction = indices2direction(primary_indices) + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + + for j in eachnode(dg) + for i in eachnode(dg) + # this is the outward normal direction on the primary element + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, + k_primary, + primary_element) + + for v in eachvariable(equations_parabolic) + # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! + flux_viscous = SVector(flux_viscous_x[v, i_primary, j_primary, + k_primary, primary_element], + flux_viscous_y[v, i_primary, j_primary, + k_primary, primary_element], + flux_viscous_z[v, i_primary, j_primary, + k_primary, primary_element]) + + interfaces.u[1, v, i, j, interface] = dot(flux_viscous, + normal_direction) + end + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + end + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + end + + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
+ secondary_element = interfaces.neighbor_ids[2, interface] + secondary_indices = interfaces.node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], + index_range) + k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + k_secondary = k_secondary_start + for j in eachnode(dg) + for i in eachnode(dg) + # This is the outward normal direction on the secondary element. + # Here, we assume that normal_direction on the secondary element is + # the negative of normal_direction on the primary element. + normal_direction = get_normal_direction(secondary_direction, + contravariant_vectors, + i_secondary, j_secondary, + k_secondary, + secondary_element) + + for v in eachvariable(equations_parabolic) + # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! 
+ flux_viscous = SVector(flux_viscous_x[v, i_secondary, j_secondary, + k_secondary, + secondary_element], + flux_viscous_y[v, i_secondary, j_secondary, + k_secondary, + secondary_element], + flux_viscous_z[v, i_secondary, j_secondary, + k_secondary, + secondary_element]) + # store the normal flux with respect to the primary normal direction, + # which is the negative of the secondary normal direction + interfaces.u[2, v, i, j, interface] = -dot(flux_viscous, + normal_direction) + end + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + k_secondary += k_secondary_step_i + end + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + k_secondary += k_secondary_step_j + end + end + + return nothing +end + +# This version is used for divergence flux computations +function calc_interface_flux!(surface_flux_values, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, dg::DG, parabolic_scheme, + cache) + (; neighbor_ids, node_indices) = cache.interfaces + (; contravariant_vectors) = cache.elements + index_range = eachnode(dg) + + @threaded for interface in eachinterface(dg, cache) + # Get element and side index information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction_index = indices2direction(primary_indices) + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + + # Get element and side index information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + 
secondary_direction_index = indices2direction(secondary_indices)
+        secondary_surface_indices = surface_indices(secondary_indices)
+
+        # Initiate the secondary indices to be used in the surface for loop.
+        # The surface indexing on the secondary element is obtained via
+        # `surface_indices` above. Note that the indices on the primary side
+        # will always run forward, but the secondary indices might need to
+        # run backwards for flipped sides.
+        i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1],
+                                                                                          index_range)
+        j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2],
+                                                                                          index_range)
+        i_secondary = i_secondary_start
+        j_secondary = j_secondary_start
+
+        for j in eachnode(dg)
+            for i in eachnode(dg)
+                normal_direction = get_normal_direction(primary_direction_index,
+                                                        contravariant_vectors,
+                                                        i_primary, j_primary, k_primary,
+                                                        primary_element)
+                # We prolong the viscous flux dotted with respect to the outward normal on the
+                # primary element.
+                viscous_flux_normal_ll, viscous_flux_normal_rr = get_surface_node_vars(cache.interfaces.u,
+                                                                                       equations_parabolic,
+                                                                                       dg,
+                                                                                       i,
+                                                                                       j,
+                                                                                       interface)
+
+                flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr,
+                                       normal_direction, Divergence(),
+                                       equations_parabolic, parabolic_scheme)
+
+                for v in eachvariable(equations_parabolic)
+                    surface_flux_values[v, i, j, primary_direction_index, primary_element] = flux_[v]
+                    # Sign flip required for divergence calculation since the flux for the
+                    # divergence involves the normal direction.
+ surface_flux_values[v, i_secondary, j_secondary, secondary_direction_index, secondary_element] = -flux_[v] + end + + # Increment the primary element indices + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + # Increment the secondary element surface indices + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + end + end + + return nothing +end + +function prolong2mortars_divergence!(cache, flux_viscous, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM) + @unpack neighbor_ids, node_indices = cache.mortars + @unpack fstar_tmp_threaded = cache + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for mortar in eachmortar(dg, cache) + # Copy solution data from the small elements using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
+ small_indices = node_indices[1, mortar] + direction_index = indices2direction(small_indices) + + i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1], + index_range) + j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2], + index_range) + k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], + index_range) + + for position in 1:4 # Loop over small elements + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + element = neighbor_ids[position, mortar] + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_small, j_small, k_small, + element) + + for v in eachvariable(equations_parabolic) + flux_viscous = SVector(flux_viscous_x[v, i_small, j_small, + k_small, element], + flux_viscous_y[v, i_small, j_small, + k_small, element], + flux_viscous_z[v, i_small, j_small, + k_small, element]) + + cache.mortars.u[1, v, position, i, j, mortar] = dot(flux_viscous, + normal_direction) + end + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + + # Buffer to copy solution values of the large element in the correct orientation + # before interpolating + u_buffer = cache.u_threaded[Threads.threadid()] + + # temporary buffer for projections + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + # Copy solution of large element face to buffer in the + # correct orientation + large_indices = node_indices[2, mortar] + + i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_indices[1], + index_range) + j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_indices[2], + index_range) + k_large_start, k_large_step_i, k_large_step_j = index_to_start_step_3d(large_indices[3], + index_range) + + i_large = 
i_large_start + j_large = j_large_start + k_large = k_large_start + element = neighbor_ids[5, mortar] # Large element + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_large, j_large, k_large, + element) + + for v in eachvariable(equations_parabolic) + flux_viscous = SVector(flux_viscous_x[v, i_large, j_large, k_large, + element], + flux_viscous_y[v, i_large, j_large, k_large, + element], + flux_viscous_z[v, i_large, j_large, k_large, + element]) + + # We prolong the viscous flux dotted with respect the outward normal + # on the small element. We scale by -1/2 here because the normal + # direction on the large element is negative 2x that of the small + # element (these normal directions are "scaled" by the surface Jacobian) + u_buffer[v, i, j] = -0.5f0 * dot(flux_viscous, normal_direction) + end + i_large += i_large_step_i + j_large += j_large_step_i + k_large += k_large_step_i + end + i_large += i_large_step_j + j_large += j_large_step_j + k_large += k_large_step_j + end + + # Interpolate large element face data from buffer to small face locations + multiply_dimensionwise!(view(cache.mortars.u, 2, :, 1, :, :, mortar), + mortar_l2.forward_lower, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mortars.u, 2, :, 2, :, :, mortar), + mortar_l2.forward_upper, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mortars.u, 2, :, 3, :, :, mortar), + mortar_l2.forward_lower, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mortars.u, 2, :, 4, :, :, mortar), + mortar_l2.forward_upper, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + end + + return nothing +end + + +function prolong2mpimortars_divergence!(cache, flux_viscous, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM) + @unpack 
local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars + @unpack fstar_tmp_threaded = cache + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for mortar in eachmpimortar(dg, cache) + small_indices = node_indices[1, mortar] + direction_index = indices2direction(small_indices) + + i_small_start, i_small_step_i, i_small_step_j = + index_to_start_step_3d(small_indices[1], index_range) + j_small_start, j_small_step_i, j_small_step_j = + index_to_start_step_3d(small_indices[2], index_range) + k_small_start, k_small_step_i, k_small_step_j = + index_to_start_step_3d(small_indices[3], index_range) + + # Fill u[1, ...] from local small elements + for position in 1:4 + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + + element = local_neighbor_ids[mortar][position] + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_small, j_small, k_small, + element) + + for v in eachvariable(equations_parabolic) + flux_viscous_node = SVector( + flux_viscous_x[v, i_small, j_small, k_small, element], + flux_viscous_y[v, i_small, j_small, k_small, element], + flux_viscous_z[v, i_small, j_small, k_small, element] + ) + + cache.mpi_mortars.u[1, v, position, i, j, mortar] = + dot(flux_viscous_node, normal_direction) + end + + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + + # Buffer for large element face values before interpolation + u_buffer = cache.u_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + # Find the local large element on this MPI mortar, if it exists + large_pos = findfirst(==(5), local_neighbor_positions[mortar]) + + if large_pos !== nothing + large_indices = 
node_indices[2, mortar] + + i_large_start, i_large_step_i, i_large_step_j = + index_to_start_step_3d(large_indices[1], index_range) + j_large_start, j_large_step_i, j_large_step_j = + index_to_start_step_3d(large_indices[2], index_range) + k_large_start, k_large_step_i, k_large_step_j = + index_to_start_step_3d(large_indices[3], index_range) + + i_large = i_large_start + j_large = j_large_start + k_large = k_large_start + + element = local_neighbor_ids[mortar][large_pos] + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_large, j_large, k_large, + element) + + for v in eachvariable(equations_parabolic) + flux_viscous_node = SVector( + flux_viscous_x[v, i_large, j_large, k_large, element], + flux_viscous_y[v, i_large, j_large, k_large, element], + flux_viscous_z[v, i_large, j_large, k_large, element] + ) + + # same scaling/sign as baseline + u_buffer[v, i, j] = -0.5f0 * dot(flux_viscous_node, normal_direction) + end + + i_large += i_large_step_i + j_large += j_large_step_i + k_large += k_large_step_i + end + i_large += i_large_step_j + j_large += j_large_step_j + k_large += k_large_step_j + end + + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 1, :, :, mortar), + mortar_l2.forward_lower, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 2, :, :, mortar), + mortar_l2.forward_upper, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 3, :, :, mortar), + mortar_l2.forward_lower, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 4, :, :, mortar), + mortar_l2.forward_upper, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + end + end + + return nothing +end + +function calc_mortar_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, + 
mortar_l2::LobattoLegendreMortarL2, + dg::DG, parabolic_scheme, cache) + @unpack neighbor_ids, node_indices = cache.mortars + @unpack contravariant_vectors = cache.elements + @unpack fstar_primary_threaded, fstar_tmp_threaded = cache + index_range = eachnode(dg) + + @threaded for mortar in eachmortar(dg, cache) + # Choose thread-specific pre-allocated container + fstar = fstar_primary_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + # Get index information on the small elements + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + + i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1], + index_range) + j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2], + index_range) + k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], + index_range) + + for position in 1:4 # Loop over small elements + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + element = neighbor_ids[position, mortar] + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(small_direction, + contravariant_vectors, + i_small, j_small, k_small, + element) + + for v in eachvariable(equations_parabolic) + viscous_flux_normal_ll = cache.mortars.u[1, v, position, i, j, + mortar] + viscous_flux_normal_rr = cache.mortars.u[2, v, position, i, j, + mortar] + + flux_ = flux_parabolic(viscous_flux_normal_ll, + viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + + # Sign flip (and scaling by 0.5) already handled above in `prolong2mortars_divergence!` + fstar[v, i, j, position] = flux_ + end + + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + + # Buffer to interpolate flux values of the 
large element to before + # copying in the correct orientation + u_buffer = cache.u_threaded[Threads.threadid()] + + # this reuses the hyperbolic version of `mortar_fluxes_to_elements!` + mortar_fluxes_to_elements!(surface_flux_values, mesh, + equations_parabolic, mortar_l2, dg, cache, + mortar, fstar, fstar, u_buffer, + fstar_tmp) + end + + return nothing +end + + +function calc_mpi_interface_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache) + @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + @threaded for interface in eachmpiinterface(dg, cache) + local_element = local_neighbor_ids[interface] + local_indices = node_indices[interface] + local_direction = indices2direction(local_indices) + local_side = local_sides[interface] + + i_element_start, i_element_step_i, i_element_step_j = + index_to_start_step_3d(local_indices[1], index_range) + j_element_start, j_element_step_i, j_element_step_j = + index_to_start_step_3d(local_indices[2], index_range) + k_element_start, k_element_step_i, k_element_step_j = + index_to_start_step_3d(local_indices[3], index_range) + + i_element = i_element_start + j_element = j_element_start + k_element = k_element_start + + local_surface_indices = surface_indices(local_indices) + i_surface_start, i_surface_step_i, i_surface_step_j = + index_to_start_step_3d(local_surface_indices[1], index_range) + j_surface_start, j_surface_step_i, j_surface_step_j = + index_to_start_step_3d(local_surface_indices[2], index_range) + + i_surface = i_surface_start + j_surface = j_surface_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(local_direction, + contravariant_vectors, + i_element, j_element, k_element, + local_element) + + calc_mpi_interface_flux_divergence!(surface_flux_values, mesh, + 
equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, + i, j, local_side, + i_surface, j_surface, + local_direction, local_element) + + i_element += i_element_step_i + j_element += j_element_step_i + k_element += k_element_step_i + + i_surface += i_surface_step_i + j_surface += j_surface_step_i + end + + i_element += i_element_step_j + j_element += j_element_step_j + k_element += k_element_step_j + + i_surface += i_surface_step_j + j_surface += j_surface_step_j + end + end + + return nothing +end + +@inline function calc_mpi_interface_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + interface_index, normal_direction, + interface_i_node_index, + interface_j_node_index, local_side, + surface_i_node_index, + surface_j_node_index, + local_direction_index, + local_element_index) + @unpack u = cache.mpi_interfaces + + viscous_flux_normal_ll, viscous_flux_normal_rr = + get_surface_node_vars(u, equations_parabolic, dg, + interface_i_node_index, interface_j_node_index, + interface_index) + + if local_side == 1 + flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + else + flux_ = -flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + -normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + end + + for v in eachvariable(equations_parabolic) + surface_flux_values[v, surface_i_node_index, surface_j_node_index, + local_direction_index, local_element_index] = flux_[v] + end + + return nothing +end + +function calc_mpi_mortar_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DG, parabolic_scheme, cache) + @unpack fstar_primary_threaded, fstar_tmp_threaded = cache + + @threaded for mortar in 
eachmpimortar(dg, cache) + fstar = fstar_primary_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + for position in 1:4 + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(cache.mpi_mortars, i, j, + position, mortar) + + calc_mpi_mortar_flux_divergence!(fstar, mesh, + equations_parabolic, + dg, parabolic_scheme, cache, + mortar, position, + normal_direction, i, j) + end + end + end + + u_buffer = cache.u_threaded[Threads.threadid()] + + mpi_mortar_fluxes_to_elements_divergence!(surface_flux_values, mesh, + equations_parabolic, mortar_l2, + dg, cache, + mortar, fstar, u_buffer, fstar_tmp) + end + + return nothing +end + +@inline function calc_mpi_mortar_flux_divergence!(fstar, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + mortar_index, position_index, + normal_direction, + i_node_index, j_node_index) + @unpack u = cache.mpi_mortars + + for v in eachvariable(equations_parabolic) + viscous_flux_normal_ll = u[1, v, position_index, i_node_index, j_node_index, mortar_index] + viscous_flux_normal_rr = u[2, v, position_index, i_node_index, j_node_index, mortar_index] + + flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + + fstar[v, i_node_index, j_node_index, position_index] = flux_ + end + + return nothing +end + +@inline function mpi_mortar_fluxes_to_elements_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM, cache, mortar, + fstar, u_buffer, fstar_tmp) + @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars + + mortar_fluxes_to_elements!(surface_flux_values, mesh, + equations_parabolic, mortar_l2, dg, cache, + mortar, fstar, fstar, u_buffer, fstar_tmp) + + return nothing +end + 
+function calc_mortar_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DG, parabolic_scheme, cache) + @unpack neighbor_ids, node_indices = cache.mortars + @unpack contravariant_vectors = cache.elements + @unpack fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded = cache + index_range = eachnode(dg) + + @threaded for mortar in eachmortar(dg, cache) + # Choose thread-specific pre-allocated container + fstar_primary = fstar_primary_threaded[Threads.threadid()] + fstar_secondary = fstar_secondary_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + # Get index information on the small elements + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + + i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1], + index_range) + j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2], + index_range) + k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], + index_range) + + for position in 1:4 + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + element = neighbor_ids[position, mortar] + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(small_direction, + contravariant_vectors, + i_small, j_small, k_small, + element) + + calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh, equations_parabolic, + dg, parabolic_scheme, cache, + mortar, position, normal_direction, + i, j) + + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + + # Buffer to interpolate flux values of the large element to before + # copying in the correct orientation + u_buffer = 
cache.u_threaded[Threads.threadid()] + + # in calc_interface_flux!, the interface flux is computed once over each + # interface using the normal from the "primary" element. The result is then + # passed back to the "secondary" element, flipping the sign to account for the + # change in the normal direction. For mortars, this sign flip occurs in + # "mortar_fluxes_to_elements_gradient!" instead. + mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh, equations_parabolic, mortar_l2, + dg, cache, + mortar, fstar_primary, fstar_secondary, + u_buffer, fstar_tmp) + end + + return nothing +end + +# We structure `calc_mortar_flux_gradient!` similarly to "calc_mortar_flux!" for +# hyperbolic equations with no nonconservative terms. +# The reasoning is that parabolic fluxes are treated like conservative +# terms (e.g., we compute a viscous conservative "flux") and thus no +# non-conservative terms are present. +@inline function calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + mortar_index, position_index, + normal_direction, + i_node_index, j_node_index) + @unpack u = cache.mortars + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, position_index, + i_node_index, j_node_index, mortar_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + # Copy flux to buffer + set_node_vars!(fstar_primary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + # As in `calc_interface_flux_gradient!`, no sign flip is necessary for the fluxes used in gradient calculations + set_node_vars!(fstar_secondary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + + return nothing +end + + +function calc_mpi_mortar_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + 
equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DG, parabolic_scheme, cache) + @unpack fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded = cache + + @threaded for mortar in eachmpimortar(dg, cache) + fstar_primary = fstar_primary_threaded[Threads.threadid()] + fstar_secondary = fstar_secondary_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + for position in 1:4 + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(cache.mpi_mortars, i, j, + position, mortar) + + calc_mpi_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh, equations_parabolic, + dg, parabolic_scheme, cache, + mortar, position, + normal_direction, i, j) + end + end + end + + u_buffer = cache.u_threaded[Threads.threadid()] + + mpi_mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh, equations_parabolic, mortar_l2, + dg, cache, + mortar, fstar_primary, fstar_secondary, + u_buffer, fstar_tmp) + end + + return nothing +end + +@inline function calc_mpi_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + mortar_index, position_index, + normal_direction, + i_node_index, j_node_index) + @unpack u = cache.mpi_mortars + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, position_index, + i_node_index, j_node_index, mortar_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + set_node_vars!(fstar_primary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + set_node_vars!(fstar_secondary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + + return nothing +end + +@inline function mpi_mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + 
mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM, cache, mortar, + fstar_primary, fstar_secondary, + u_buffer, fstar_tmp) + @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars + index_range = eachnode(dg) + + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + large_indices = node_indices[2, mortar] + large_direction = indices2direction(large_indices) + large_surface_indices = surface_indices(large_indices) + + i_large_start, i_large_step_i, i_large_step_j = + index_to_start_step_3d(large_surface_indices[1], index_range) + j_large_start, j_large_step_i, j_large_step_j = + index_to_start_step_3d(large_surface_indices[2], index_range) + + for (element, position) in zip(local_neighbor_ids[mortar], + local_neighbor_positions[mortar]) + if position == 5 + multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_lower, mortar_l2.reverse_lower, + view(fstar_secondary, .., 1), + fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, + mortar_l2.reverse_lower, + view(fstar_secondary, .., 2), + fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_lower, + mortar_l2.reverse_upper, + view(fstar_secondary, .., 3), + fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, + mortar_l2.reverse_upper, + view(fstar_secondary, .., 4), + fstar_tmp) + + # same sign/scale handling as local mortar_fluxes_to_elements_gradient! 
+ u_buffer .*= -4 + + i_large = i_large_start + j_large = j_large_start + for j in eachnode(dg) + for i in eachnode(dg) + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i_large, j_large, + large_direction, element] = u_buffer[v, i, j] + end + i_large += i_large_step_i + j_large += j_large_step_i + end + i_large += i_large_step_j + j_large += j_large_step_j + end + else + for j in eachnode(dg) + for i in eachnode(dg) + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i, j, small_direction, element] = + fstar_primary[v, i, j, position] + end + end + end + end + end + + return nothing +end + + + + + +function calc_volume_integral_gradient!(gradients, u_transformed, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, # for dispatch only + equations_parabolic::AbstractEquationsParabolic, + dg::DG, cache) + @unpack derivative_hat = dg.basis + @unpack contravariant_vectors = cache.elements + gradients_x, gradients_y, gradients_z = gradients + + @threaded for element in eachelement(dg, cache) + # Calculate volume terms in one element, + # corresponds to `kernel` functions for the hyperbolic part of the flux + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u_transformed, equations_parabolic, dg, + i, j, k, element) + + for ii in eachnode(dg) + multiply_add_to_node_vars!(gradients_x, derivative_hat[ii, i], + u_node, equations_parabolic, dg, + ii, j, k, element) + end + + for jj in eachnode(dg) + multiply_add_to_node_vars!(gradients_y, derivative_hat[jj, j], + u_node, equations_parabolic, dg, + i, jj, k, element) + end + + for kk in eachnode(dg) + multiply_add_to_node_vars!(gradients_z, derivative_hat[kk, k], + u_node, equations_parabolic, dg, + i, j, kk, element) + end + end + + # now that the reference coordinate gradients are computed, transform them node-by-node to physical gradients + # using the contravariant vectors + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + Ja11, 
Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, element) + Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, + i, j, k, element) + Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, + i, j, k, element) + + gradients_reference_1 = get_node_vars(gradients_x, equations_parabolic, dg, + i, j, k, element) + gradients_reference_2 = get_node_vars(gradients_y, equations_parabolic, dg, + i, j, k, element) + gradients_reference_3 = get_node_vars(gradients_z, equations_parabolic, dg, + i, j, k, element) + + # note that the contravariant vectors are transposed compared with computations of flux + # divergences in `calc_volume_integral!`. See + # https://github.com/trixi-framework/Trixi.jl/pull/1490#discussion_r1213345190 + # for a more detailed discussion. + gradient_x_node = Ja11 * gradients_reference_1 + + Ja21 * gradients_reference_2 + + Ja31 * gradients_reference_3 + gradient_y_node = Ja12 * gradients_reference_1 + + Ja22 * gradients_reference_2 + + Ja32 * gradients_reference_3 + gradient_z_node = Ja13 * gradients_reference_1 + + Ja23 * gradients_reference_2 + + Ja33 * gradients_reference_3 + + set_node_vars!(gradients_x, gradient_x_node, equations_parabolic, dg, + i, j, k, element) + set_node_vars!(gradients_y, gradient_y_node, equations_parabolic, dg, + i, j, k, element) + set_node_vars!(gradients_z, gradient_z_node, equations_parabolic, dg, + i, j, k, element) + end + end + + return nothing +end + +# Specialization `flux_viscous::Tuple` needed to +# avoid amibiguity with the hyperbolic version of `prolong2boundaries!` in dg_3d.jl +# which is for the variables itself, i.e., `u::Array{uEltype, 5}`. 
+function prolong2boundaries!(cache, flux_viscous::Tuple, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + dg::DG) + (; boundaries) = cache + (; contravariant_vectors) = cache.elements + index_range = eachnode(dg) + + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for boundary in eachboundary(dg, cache) + # Copy solution data from the element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. + element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + direction = indices2direction(node_indices) + + i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1], + index_range) + j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2], + index_range) + k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3], + index_range) + + i_node = i_node_start + j_node = j_node_start + k_node = k_node_start + + for j in eachnode(dg) + for i in eachnode(dg) + # this is the outward normal direction on the primary element + normal_direction = get_normal_direction(direction, + contravariant_vectors, + i_node, j_node, k_node, element) + + for v in eachvariable(equations_parabolic) + flux_viscous = SVector(flux_viscous_x[v, i_node, j_node, k_node, + element], + flux_viscous_y[v, i_node, j_node, k_node, + element], + flux_viscous_z[v, i_node, j_node, k_node, + element]) + + boundaries.u[v, i, j, boundary] = dot(flux_viscous, + normal_direction) + end + i_node += i_node_step_i + j_node += j_node_step_i + k_node += k_node_step_i + end + i_node += i_node_step_j + j_node += j_node_step_j + k_node += k_node_step_j + end + end + return nothing +end + +function calc_boundary_flux!(cache, t, + boundary_condition_parabolic, # works with Dict types + boundary_condition_indices, + operator_type, mesh::Union{P4estMeshParallel{3}, 
T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + surface_integral, dg::DG) + (; boundaries) = cache + (; node_coordinates, surface_flux_values) = cache.elements + (; contravariant_vectors) = cache.elements + index_range = eachnode(dg) + + @threaded for local_index in eachindex(boundary_condition_indices) + # Use the local index to get the global boundary index from the pre-sorted list + boundary_index = boundary_condition_indices[local_index] + + # Get information on the adjacent element, compute the surface fluxes, + # and store them + element = boundaries.neighbor_ids[boundary_index] + node_indices = boundaries.node_indices[boundary_index] + direction_index = indices2direction(node_indices) + + i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1], + index_range) + j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2], + index_range) + k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3], + index_range) + + i_node = i_node_start + j_node = j_node_start + k_node = k_node_start + + for j in eachnode(dg) + for i in eachnode(dg) + # Extract solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations_parabolic, dg, i, j, + boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_node, j_node, k_node, element) + + # TODO: revisit if we want more general boundary treatments. + # This assumes the gradient numerical flux at the boundary is the gradient variable, + # which is consistent with BR1, LDG. 
+ flux_inner = u_inner + + # Coordinates at boundary node + x = get_node_coords(node_coordinates, equations_parabolic, dg, + i_node, j_node, k_node, element) + + flux_ = boundary_condition_parabolic(flux_inner, u_inner, + normal_direction, + x, t, operator_type, + equations_parabolic) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i, j, direction_index, element] = flux_[v] + end + + i_node += i_node_step_i + j_node += j_node_step_i + k_node += k_node_step_i + end + i_node += i_node_step_j + j_node += j_node_step_j + k_node += k_node_step_j + end + end + + return nothing +end + +function calc_surface_integral_gradient!(gradients, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, # for dispatch only + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM, cache) + @unpack inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + @unpack contravariant_vectors = cache.elements + + # We also use explicit assignments instead of `+=` to let `@muladd` turn these + # into FMAs (see comment at the top of the file). 
+ factor = inverse_weights[1] # For LGL basis: Identical to weighted boundary interpolation at x = ±1 + @threaded for element in eachelement(dg, cache) + for l in eachnode(dg), m in eachnode(dg) + for v in eachvariable(equations_parabolic) + for dim in 1:3 + grad = gradients[dim] + # surface at -x + normal_direction = get_normal_direction(1, contravariant_vectors, + 1, l, m, + element) + grad[v, 1, l, m, element] = (grad[v, + 1, l, m, + element] + + surface_flux_values[v, + l, m, 1, + element] * + factor * + normal_direction[dim]) + + # surface at +x + normal_direction = get_normal_direction(2, contravariant_vectors, + nnodes(dg), l, m, + element) + grad[v, nnodes(dg), l, m, element] = (grad[v, + nnodes(dg), l, m, + element] + + surface_flux_values[v, + l, m, 2, + element] * + factor * + normal_direction[dim]) + + # surface at -y + normal_direction = get_normal_direction(3, contravariant_vectors, + l, 1, m, + element) + grad[v, l, 1, m, element] = (grad[v, + l, 1, m, + element] + + surface_flux_values[v, + l, m, 3, + element] * + factor * + normal_direction[dim]) + + # surface at +y + normal_direction = get_normal_direction(4, contravariant_vectors, + l, nnodes(dg), m, + element) + grad[v, l, nnodes(dg), m, element] = (grad[v, + l, nnodes(dg), m, + element] + + surface_flux_values[v, + l, m, 4, + element] * + factor * + normal_direction[dim]) + + # surface at -z + normal_direction = get_normal_direction(5, contravariant_vectors, + l, m, 1, + element) + grad[v, l, m, 1, element] = (grad[v, + l, m, 1, + element] + + surface_flux_values[v, + l, m, 5, + element] * + factor * + normal_direction[dim]) + + # surface at +z + normal_direction = get_normal_direction(6, contravariant_vectors, + l, m, nnodes(dg), + element) + grad[v, l, m, nnodes(dg), element] = (grad[v, + l, m, nnodes(dg), + element] + + surface_flux_values[v, + l, m, 6, + element] * + factor * + normal_direction[dim]) + end + end + end + end + + return nothing +end + + + +function 
prolong2mpiinterfaces!(cache, flux_viscous::Tuple, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic, dg::DG) + @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for interface in eachmpiinterface(dg, cache) + local_element = local_neighbor_ids[interface] + local_indices = node_indices[interface] + local_direction = indices2direction(local_indices) + local_side = local_sides[interface] + + i_start, i_step_i, i_step_j = index_to_start_step_3d(local_indices[1], index_range) + j_start, j_step_i, j_step_j = index_to_start_step_3d(local_indices[2], index_range) + k_start, k_step_i, k_step_j = index_to_start_step_3d(local_indices[3], index_range) + + i_elem = i_start + j_elem = j_start + k_elem = k_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(local_direction, + contravariant_vectors, + i_elem, j_elem, k_elem, + local_element) + + for v in eachvariable(equations_parabolic) + flux_node = SVector( + flux_viscous_x[v, i_elem, j_elem, k_elem, local_element], + flux_viscous_y[v, i_elem, j_elem, k_elem, local_element], + flux_viscous_z[v, i_elem, j_elem, k_elem, local_element] + ) + + cache.mpi_interfaces.u[local_side, v, i, j, interface] = + dot(flux_node, normal_direction) + end + + i_elem += i_step_i + j_elem += j_step_i + k_elem += k_step_i + end + + i_elem += i_step_j + j_elem += j_step_j + k_elem += k_step_j + end + end + + return nothing +end + + +# Needed to *not* flip the sign of the inverse Jacobian. +# This is because the parabolic fluxes are assumed to be of the form +# `du/dt + df/dx = dg/dx + source(x,t)`, +# where f(u) is the inviscid flux and g(u) is the viscous flux. 
+function apply_jacobian_parabolic!(du::AbstractArray, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + dg::DG, cache) + @unpack inverse_jacobian = cache.elements + + @threaded for element in eachelement(dg, cache) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + factor = inverse_jacobian[i, j, k, element] + + for v in eachvariable(equations_parabolic) + du[v, i, j, k, element] *= factor + end + end + end + + return nothing +end +end # @muladd diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl index e7f97d96933..636292bc41a 100644 --- a/src/solvers/dgsem_p4est/dg_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_parallel.jl @@ -264,6 +264,29 @@ function create_cache(mesh::P4estMeshParallel, equations::AbstractEquations, dg: return cache end +function create_cache_parabolic(mesh::P4estMeshParallel, + equations_hyperbolic::AbstractEquations, + dg::DG, n_elements, uEltype ) + + # Keep mesh state consistent, just like the hyperbolic parallel cache + balance!(mesh) + partition!(mesh) + update_ghost_layer!(mesh) + + # Local parabolic containers + elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype) + interfaces = init_interfaces(mesh, equations_hyperbolic, dg.basis, elements) + boundaries = init_boundaries(mesh, equations_hyperbolic, dg.basis, elements) + + + # --- Minimal viscous container (ONLY what is already used downstream) + viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) + + return (; elements, interfaces, boundaries, viscous_container) +end + function init_mpi_cache(mesh::P4estMeshParallel, mpi_interfaces, mpi_mortars, nvars, nnodes, uEltype) mpi_cache = P4estMPICache(uEltype) diff --git a/src/solvers/dgsem_tree/dg_3d_parabolic.jl b/src/solvers/dgsem_tree/dg_3d_parabolic.jl index 79b9aaddcec..ab334ebe53d 100644 --- a/src/solvers/dgsem_tree/dg_3d_parabolic.jl +++ 
b/src/solvers/dgsem_tree/dg_3d_parabolic.jl @@ -8,7 +8,7 @@ # This method is called when a `SemidiscretizationHyperbolicParabolic` is constructed. # It constructs the basic `cache` used throughout the simulation to compute # the RHS etc. -function create_cache_parabolic(mesh::Union{TreeMesh{3}, P4estMesh{3}}, +function create_cache_parabolic(mesh::Union{TreeMesh{3}}, equations_hyperbolic::AbstractEquations, dg::DG, n_elements, uEltype) viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), From 8837f5465893ea18d543e6a668b59953ea276cd4 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 20 Mar 2026 14:40:25 +0100 Subject: [PATCH 2/7] fixed orientation error --- .../dgsem_p4est/dg_3d_parabolic_parallel.jl | 67 +++++++------------ 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl index 7447d67a6ae..b2a979c212f 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl @@ -20,7 +20,6 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshPa interfaces = cache_parabolic.interfaces, boundaries = cache_parabolic.boundaries) - # # Stage 0: local variable transform # @trixi_timeit timer() "transform variables" begin @@ -65,7 +64,10 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshPa @trixi_timeit timer() "finish MPI receive gradient" begin finish_mpi_receive!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) end - + # Finish gradient-stage MPI send + @trixi_timeit timer() "finish MPI send gradient" begin + finish_mpi_send!(cache_p.mpi_cache) + end # MPI interface fluxes for gradient stage @trixi_timeit timer() "MPI interface flux gradient" begin calc_mpi_interface_flux_gradient!(cache_p.elements.surface_flux_values, @@ -73,19 +75,24 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, 
T8codeMeshPa dg, parabolic_scheme, cache_p) end - # MPI mortar fluxes for gradient stage - @trixi_timeit timer() "MPI mortar flux gradient" begin - calc_mpi_mortar_flux_gradient!(cache_p.elements.surface_flux_values, - mesh, equations_parabolic, dg.mortar, - dg, parabolic_scheme, cache_p) - end + # MPI mortar fluxes for gradient stage + @trixi_timeit timer() "MPI mortar flux gradient" begin + calc_mpi_mortar_flux_gradient!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache_p) + end - # Finish gradient-stage MPI send - @trixi_timeit timer() "finish MPI send gradient" begin - finish_mpi_send!(cache_p.mpi_cache) + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral_gradient!(gradients, mesh, equations_parabolic, + dg, cache_p) end - # + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" begin + apply_jacobian_parabolic!(gradients, mesh, equations_parabolic, dg, + cache_p) + end # Stage 2: local viscous flux construction # @trixi_timeit timer() "calculate viscous fluxes" begin @@ -122,7 +129,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshPa @trixi_timeit timer() "prolong2mpiinterfaces divergence" begin prolong2mpiinterfaces!(cache_p, flux_viscous, mesh, equations_parabolic, dg) end - + ########################## Divergence ################################# # Start divergence-stage MPI send @trixi_timeit timer() "start MPI send divergence" begin start_mpi_send!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) @@ -268,34 +275,6 @@ function calc_gradient!(gradients, u_transformed, t, dg, parabolic_scheme, cache) end - - # Calculate MPI interface fluxes - @trixi_timeit timer() "MPI interface flux" begin - @unpack surface_flux_values = cache.elements - calc_mpi_interface_flux_gradient!(surface_flux_values, mesh, equations_parabolic, - dg, parabolic_scheme, cache) - end - - # Calculate MPI 
mortar fluxes - @trixi_timeit timer() "MPI mortar flux" begin - calc_mpi_mortar_flux_gradient!(cache.elements.surface_flux_values, - mesh, equations_parabolic, dg.mortar, - dg, parabolic_scheme, cache) - end - - # Calculate surface integrals - @trixi_timeit timer() "surface integral" begin - calc_surface_integral_gradient!(gradients, mesh, equations_parabolic, - dg, cache) - end - - - # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" begin - apply_jacobian_parabolic!(gradients, mesh, equations_parabolic, dg, - cache) - end - return nothing end @@ -1607,7 +1586,7 @@ end fstar_tmp) # same sign/scale handling as local mortar_fluxes_to_elements_gradient! - u_buffer .*= -4 + #u_buffer .*= -4 i_large = i_large_start j_large = j_large_start @@ -1972,6 +1951,7 @@ function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, local_indices = node_indices[interface] local_direction = indices2direction(local_indices) local_side = local_sides[interface] + orientationFactor = local_side==1 ? 
1 : -1 i_start, i_step_i, i_step_j = index_to_start_step_3d(local_indices[1], index_range) j_start, j_step_i, j_step_j = index_to_start_step_3d(local_indices[2], index_range) @@ -1995,8 +1975,7 @@ function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, flux_viscous_z[v, i_elem, j_elem, k_elem, local_element] ) - cache.mpi_interfaces.u[local_side, v, i, j, interface] = - dot(flux_node, normal_direction) + cache.mpi_interfaces.u[local_side, v, i, j, interface] = orientationFactor .* dot(flux_node, normal_direction) end i_elem += i_step_i From a89066baa604b8dd9886f5f8a4b0c708cf0c8976 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 20 Mar 2026 21:54:36 +0100 Subject: [PATCH 3/7] matching MPI and threaded results --- .../dgsem_p4est/dg_3d_parabolic_parallel.jl | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl index b2a979c212f..790c2df4a8a 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl @@ -542,6 +542,7 @@ function calc_mpi_interface_flux_gradient!(surface_flux_values, return nothing end + @inline function calc_mpi_interface_flux_gradient!(surface_flux_values, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, @@ -561,13 +562,8 @@ end interface_j_node_index, interface_index) - if local_side == 1 - flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), - equations_parabolic, parabolic_scheme) - else # local_side == 2 - flux_ = flux_parabolic(u_ll, u_rr, -normal_direction, Gradient(), - equations_parabolic, parabolic_scheme) - end + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) for v in eachvariable(equations_parabolic) surface_flux_values[v, surface_i_node_index, surface_j_node_index, @@ -577,7 +573,6 @@ end return nothing end - # This is the version used when calculating the 
divergence of the viscous fluxes. # Identical to weak-form volume integral/kernel for the purely hyperbolic case, # except that the fluxes are here already precomputed in `calc_viscous_fluxes!` @@ -1277,19 +1272,16 @@ end interface_i_node_index, interface_j_node_index, interface_index) - if local_side == 1 + flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, normal_direction, Divergence(), equations_parabolic, parabolic_scheme) - else - flux_ = -flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, - -normal_direction, Divergence(), - equations_parabolic, parabolic_scheme) - end + + dirFactor = (local_side==1) ? 1 : -1 for v in eachvariable(equations_parabolic) surface_flux_values[v, surface_i_node_index, surface_j_node_index, - local_direction_index, local_element_index] = flux_[v] + local_direction_index, local_element_index] = dirFactor .* flux_[v] end return nothing @@ -1969,13 +1961,13 @@ function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, local_element) for v in eachvariable(equations_parabolic) - flux_node = SVector( + flux_viscous = SVector( flux_viscous_x[v, i_elem, j_elem, k_elem, local_element], flux_viscous_y[v, i_elem, j_elem, k_elem, local_element], flux_viscous_z[v, i_elem, j_elem, k_elem, local_element] ) - cache.mpi_interfaces.u[local_side, v, i, j, interface] = orientationFactor .* dot(flux_node, normal_direction) + cache.mpi_interfaces.u[local_side, v, i, j, interface] = orientationFactor .* dot(flux_viscous, normal_direction) end i_elem += i_step_i From 2c8d9b685e787648a8ea79af042c0707f73788b0 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 23 Mar 2026 14:40:41 +0100 Subject: [PATCH 4/7] added indicatorPositional for mortarTesting and fixed parabolicViscous mortars --- src/callbacks_step/amr.jl | 32 +++ src/callbacks_step/amr_dg2d.jl | 51 +++++ src/solvers/dgsem/indicators.jl | 41 ++++ .../dgsem_p4est/dg_3d_parabolic_parallel.jl | 200 +++++++++--------- src/solvers/dgsem_tree/indicators_1d.jl | 2 +- 
src/solvers/dgsem_tree/indicators_3d.jl | 10 + 6 files changed, 237 insertions(+), 99 deletions(-) diff --git a/src/callbacks_step/amr.jl b/src/callbacks_step/amr.jl index 5d28cdacf10..64c2182af8c 100644 --- a/src/callbacks_step/amr.jl +++ b/src/callbacks_step/amr.jl @@ -633,6 +633,38 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::P4estMesh, partition!(mesh) rebalance_solver!(u_ode, mesh, equations, dg, cache, old_global_first_quadrant) + + # Resync parabolic cache after repartitioning + @unpack viscous_container = cache_parabolic + resize!(viscous_container, equations, dg, cache) + + if hasproperty(cache_parabolic, :elements) + nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1) + resize!(cache_parabolic.elements, nelements) + copyto!(cache_parabolic.elements.node_coordinates, cache.elements.node_coordinates) + copyto!(cache_parabolic.elements.jacobian_matrix, cache.elements.jacobian_matrix) + copyto!(cache_parabolic.elements.contravariant_vectors, cache.elements.contravariant_vectors) + copyto!(cache_parabolic.elements.inverse_jacobian, cache.elements.inverse_jacobian) + copyto!(cache_parabolic.elements.surface_flux_values, cache.elements.surface_flux_values) + end + + if hasproperty(cache_parabolic, :interfaces) + ninterfaces = size(cache.interfaces.neighbor_ids, 2) + resize!(cache_parabolic.interfaces, ninterfaces) + copyto!(cache_parabolic.interfaces.u, cache.interfaces.u) + copyto!(cache_parabolic.interfaces.neighbor_ids, cache.interfaces.neighbor_ids) + copyto!(cache_parabolic.interfaces.node_indices, cache.interfaces.node_indices) + end + + if hasproperty(cache_parabolic, :boundaries) + nboundaries = length(cache.boundaries.neighbor_ids) + resize!(cache_parabolic.boundaries, nboundaries) + copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) + copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids) + copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices) + 
copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) + end + end end diff --git a/src/callbacks_step/amr_dg2d.jl b/src/callbacks_step/amr_dg2d.jl index 2141c2713bc..91fad708877 100644 --- a/src/callbacks_step/amr_dg2d.jl +++ b/src/callbacks_step/amr_dg2d.jl @@ -193,6 +193,32 @@ function refine!(u_ode::AbstractVector, adaptor, @unpack viscous_container = cache_parabolic resize!(viscous_container, equations, dg, cache) + # Keep parabolic topology containers in sync if they exist (currently only for P4estMesh thus the conditional + if hasproperty(cache_parabolic, :elements) + nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1) + resize!(cache_parabolic.elements, nelements) + + copyto!(cache_parabolic.elements.node_coordinates, cache.elements.node_coordinates) + copyto!(cache_parabolic.elements.jacobian_matrix, cache.elements.jacobian_matrix) + copyto!(cache_parabolic.elements.contravariant_vectors, cache.elements.contravariant_vectors) + copyto!(cache_parabolic.elements.inverse_jacobian, cache.elements.inverse_jacobian) + copyto!(cache_parabolic.elements.surface_flux_values, cache.elements.surface_flux_values) + end + if hasproperty(cache_parabolic, :interfaces) + resize!(cache_parabolic.interfaces, size(cache.interfaces.neighbor_ids, 2)) + copyto!(cache_parabolic.interfaces.u, cache.interfaces.u) + copyto!(cache_parabolic.interfaces.neighbor_ids, cache.interfaces.neighbor_ids) + copyto!(cache_parabolic.interfaces.node_indices, cache.interfaces.node_indices) + end + + if hasproperty(cache_parabolic, :boundaries) + resize!(cache_parabolic.boundaries, length(cache.boundaries.neighbor_ids)) + copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) + copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids) + copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices) + copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) + end + return nothing end @@ -387,6 +413,31 @@ function 
coarsen!(u_ode::AbstractVector, adaptor, # Resize parabolic helper variables @unpack viscous_container = cache_parabolic resize!(viscous_container, equations, dg, cache) + # Keep parabolic topology containers in sync if they exist (currently only for P4estMesh thus the conditional + if hasproperty(cache_parabolic, :elements) + nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1) + resize!(cache_parabolic.elements, nelements) + + copyto!(cache_parabolic.elements.node_coordinates, cache.elements.node_coordinates) + copyto!(cache_parabolic.elements.jacobian_matrix, cache.elements.jacobian_matrix) + copyto!(cache_parabolic.elements.contravariant_vectors, cache.elements.contravariant_vectors) + copyto!(cache_parabolic.elements.inverse_jacobian, cache.elements.inverse_jacobian) + copyto!(cache_parabolic.elements.surface_flux_values, cache.elements.surface_flux_values) + end + if hasproperty(cache_parabolic, :interfaces) + resize!(cache_parabolic.interfaces, size(cache.interfaces.neighbor_ids, 2)) + copyto!(cache_parabolic.interfaces.u, cache.interfaces.u) + copyto!(cache_parabolic.interfaces.neighbor_ids, cache.interfaces.neighbor_ids) + copyto!(cache_parabolic.interfaces.node_indices, cache.interfaces.node_indices) + end + + if hasproperty(cache_parabolic, :boundaries) + resize!(cache_parabolic.boundaries, length(cache.boundaries.neighbor_ids)) + copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) + copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids) + copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices) + copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) + end return nothing end diff --git a/src/solvers/dgsem/indicators.jl b/src/solvers/dgsem/indicators.jl index db1349f854d..713dc92672e 100644 --- a/src/solvers/dgsem/indicators.jl +++ b/src/solvers/dgsem/indicators.jl @@ -489,4 +489,45 @@ function Base.show(io::IO, ::MIME"text/plain", end return nothing end + + +""" + 
IndicatorPositional(f) + +Create an AMR indicator from a positional rule `f(x, t)`. The usecase for this rule is explict testing and or targeted "static" refinment +based on a functional rule. The function `f` is called with the element center `x::SVector{ndims,Float64}` and time `t`. +""" +struct IndicatorPositional{F,Cache}<:AbstractIndicator + rule::F + cache::Cache +end + +# this method is used when the indicator is constructed as for AMR +function IndicatorPositional(rule,semi::AbstractSemidiscretization) + cache = create_cache(IndicatorPositional, semi) + return IndicatorPositional{typeof(rule), typeof(cache)}(rule, cache) +end + +function (positional::IndicatorPositional)(u, mesh, equations, dg, cache; t=0.0, iter=0) + x = cache.elements.node_coordinates + @unpack alpha, center_threaded = positional.cache + resize!(alpha, nelements(dg, cache)) + #### @threaded does not work with cfunction (positional.rule) thus Threads.@threads for now + Threads.@threads for element in Trixi.eachelement(dg, cache) + center = center_threaded[Threads.threadid()] + fill!(center, 0.0) + n = 0 + + for k in Trixi.eachnode(dg), j in Trixi.eachnode(dg), i in Trixi.eachnode(dg) + center[1] += x[1, i, j, k, element] + center[2] += x[2, i, j, k, element] + center[3] += x[3, i, j, k, element] + n += 1 + end + center ./= n + alpha[element] = positional.rule(center, t) + end + return alpha +end + end # @muladd diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl index 790c2df4a8a..e6cf2189e88 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl @@ -979,14 +979,18 @@ function prolong2mpimortars_divergence!(cache, flux_viscous, equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) - @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars - @unpack fstar_tmp_threaded = cache + + @unpack node_indices = 
cache.mpi_mortars @unpack contravariant_vectors = cache.elements index_range = eachnode(dg) flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous @threaded for mortar in eachmpimortar(dg, cache) + local_neighbor_ids = cache.mpi_mortars.local_neighbor_ids[mortar] + local_neighbor_positions = cache.mpi_mortars.local_neighbor_positions[mortar] + + # Small side indexing small_indices = node_indices[1, mortar] direction_index = indices2direction(small_indices) @@ -997,108 +1001,108 @@ function prolong2mpimortars_divergence!(cache, flux_viscous, k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], index_range) - # Fill u[1, ...] from local small elements - for position in 1:4 - i_small = i_small_start - j_small = j_small_start - k_small = k_small_start - - element = local_neighbor_ids[mortar][position] - - for j in eachnode(dg) - for i in eachnode(dg) - normal_direction = get_normal_direction(direction_index, - contravariant_vectors, - i_small, j_small, k_small, - element) + # Large side indexing + large_indices = node_indices[2, mortar] - for v in eachvariable(equations_parabolic) - flux_viscous_node = SVector( - flux_viscous_x[v, i_small, j_small, k_small, element], - flux_viscous_y[v, i_small, j_small, k_small, element], - flux_viscous_z[v, i_small, j_small, k_small, element] - ) - - cache.mpi_mortars.u[1, v, position, i, j, mortar] = - dot(flux_viscous_node, normal_direction) + i_large_start, i_large_step_i, i_large_step_j = + index_to_start_step_3d(large_indices[1], index_range) + j_large_start, j_large_step_i, j_large_step_j = + index_to_start_step_3d(large_indices[2], index_range) + k_large_start, k_large_step_i, k_large_step_j = + index_to_start_step_3d(large_indices[3], index_range) + + for (element, position) in zip(local_neighbor_ids, local_neighbor_positions) + + if position == 5 + # ========================= + # LARGE ELEMENT + # ========================= + u_buffer = cache.u_threaded[Threads.threadid()] + 
fstar_tmp = cache.fstar_tmp_threaded[Threads.threadid()] + + i_large = i_large_start + j_large = j_large_start + k_large = k_large_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_large, j_large, k_large, + element) + + for v in eachvariable(equations_parabolic) + flux_node = SVector( + flux_viscous_x[v, i_large, j_large, k_large, element], + flux_viscous_y[v, i_large, j_large, k_large, element], + flux_viscous_z[v, i_large, j_large, k_large, element] + ) + + # same convention as local code + u_buffer[v, i, j] = -0.5f0 * dot(flux_node, normal_direction) + end + + i_large += i_large_step_i + j_large += j_large_step_i + k_large += k_large_step_i end - - i_small += i_small_step_i - j_small += j_small_step_i - k_small += k_small_step_i + i_large += i_large_step_j + j_large += j_large_step_j + k_large += k_large_step_j end - i_small += i_small_step_j - j_small += j_small_step_j - k_small += k_small_step_j - end - end - - # Buffer for large element face values before interpolation - u_buffer = cache.u_threaded[Threads.threadid()] - fstar_tmp = fstar_tmp_threaded[Threads.threadid()] - - # Find the local large element on this MPI mortar, if it exists - large_pos = findfirst(==(5), local_neighbor_positions[mortar]) - - if large_pos !== nothing - large_indices = node_indices[2, mortar] - i_large_start, i_large_step_i, i_large_step_j = - index_to_start_step_3d(large_indices[1], index_range) - j_large_start, j_large_step_i, j_large_step_j = - index_to_start_step_3d(large_indices[2], index_range) - k_large_start, k_large_step_i, k_large_step_j = - index_to_start_step_3d(large_indices[3], index_range) - - i_large = i_large_start - j_large = j_large_start - k_large = k_large_start - - element = local_neighbor_ids[mortar][large_pos] - - for j in eachnode(dg) - for i in eachnode(dg) - normal_direction = get_normal_direction(direction_index, - contravariant_vectors, - i_large, j_large, 
k_large, - element) - - for v in eachvariable(equations_parabolic) - flux_viscous_node = SVector( - flux_viscous_x[v, i_large, j_large, k_large, element], - flux_viscous_y[v, i_large, j_large, k_large, element], - flux_viscous_z[v, i_large, j_large, k_large, element] - ) - - # same scaling/sign as baseline - u_buffer[v, i, j] = -0.5f0 * dot(flux_viscous_node, normal_direction) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 1, :, :, mortar), + mortar_l2.forward_lower, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 2, :, :, mortar), + mortar_l2.forward_upper, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 3, :, :, mortar), + mortar_l2.forward_lower, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 4, :, :, mortar), + mortar_l2.forward_upper, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + + else + # ========================= + # SMALL ELEMENT (1–4) + # ========================= + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_small, j_small, k_small, + element) + + for v in eachvariable(equations_parabolic) + flux_node = SVector( + flux_viscous_x[v, i_small, j_small, k_small, element], + flux_viscous_y[v, i_small, j_small, k_small, element], + flux_viscous_z[v, i_small, j_small, k_small, element] + ) + + cache.mpi_mortars.u[1, v, position, i, j, mortar] = + dot(flux_node, normal_direction) + end + + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i end - - i_large += i_large_step_i - j_large += j_large_step_i - k_large += k_large_step_i + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j end - i_large += i_large_step_j - j_large += j_large_step_j - 
k_large += k_large_step_j end - - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 1, :, :, mortar), - mortar_l2.forward_lower, - mortar_l2.forward_lower, - u_buffer, fstar_tmp) - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 2, :, :, mortar), - mortar_l2.forward_upper, - mortar_l2.forward_lower, - u_buffer, fstar_tmp) - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 3, :, :, mortar), - mortar_l2.forward_lower, - mortar_l2.forward_upper, - u_buffer, fstar_tmp) - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 4, :, :, mortar), - mortar_l2.forward_upper, - mortar_l2.forward_upper, - u_buffer, fstar_tmp) end end @@ -1358,7 +1362,7 @@ end fstar, u_buffer, fstar_tmp) @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars - mortar_fluxes_to_elements!(surface_flux_values, mesh, + mpi_mortar_fluxes_to_elements!(surface_flux_values, mesh, equations_parabolic, mortar_l2, dg, cache, mortar, fstar, fstar, u_buffer, fstar_tmp) diff --git a/src/solvers/dgsem_tree/indicators_1d.jl b/src/solvers/dgsem_tree/indicators_1d.jl index d72fdd65857..2bdcece8acf 100644 --- a/src/solvers/dgsem_tree/indicators_1d.jl +++ b/src/solvers/dgsem_tree/indicators_1d.jl @@ -120,7 +120,7 @@ function create_cache(::Union{Type{IndicatorLöhner}, Type{IndicatorMax}}, end # this method is used when the indicator is constructed as for AMR -function create_cache(typ::Union{Type{IndicatorLöhner}, Type{IndicatorMax}}, +function create_cache(typ::Union{Type{IndicatorLöhner}, Type{IndicatorMax},Type{IndicatorPositional}}, mesh, equations::AbstractEquations, dg::DGSEM, cache) return create_cache(typ, equations, dg.basis) end diff --git a/src/solvers/dgsem_tree/indicators_3d.jl b/src/solvers/dgsem_tree/indicators_3d.jl index 05ba46940c6..369478f2a74 100644 --- a/src/solvers/dgsem_tree/indicators_3d.jl +++ b/src/solvers/dgsem_tree/indicators_3d.jl @@ -185,6 +185,16 @@ function create_cache(::Union{Type{IndicatorLöhner}, Type{IndicatorMax}}, return 
(; alpha, indicator_threaded) end +function create_cache(::Type{IndicatorPositional}, + equations::AbstractEquations{3}, basis::LobattoLegendreBasis) + uEltype = real(basis) + alpha = Vector{uEltype}() + + center_threaded = [MVector{3, uEltype}(undef) for _ in 1:Threads.maxthreadid()] + + return (; alpha, center_threaded) +end + function (löhner::IndicatorLöhner)(u::AbstractArray{<:Any, 5}, mesh, equations, dg::DGSEM, cache; kwargs...) From ef5435ee10a68c8c5e916f17f7618deec3a128b8 Mon Sep 17 00:00:00 2001 From: TJP-Karpowski Date: Mon, 23 Mar 2026 19:54:46 +0100 Subject: [PATCH 5/7] simplified cache treatment, added mpi mortar testcase --- ...erstokes_taylor_green_vortex_amr_mortar.jl | 146 ++++++++++++++++++ src/Trixi.jl | 2 +- ...semidiscretization_hyperbolic_parabolic.jl | 1 + src/solvers/dgsem_p4est/dg_2d_parabolic.jl | 37 ++--- test/test_mpi_p4est_3d.jl | 65 +++++++- test/test_parabolic_3d.jl | 27 ++++ 6 files changed, 256 insertions(+), 22 deletions(-) create mode 100644 examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl b/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl new file mode 100644 index 00000000000..52acda819b1 --- /dev/null +++ b/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl @@ -0,0 +1,146 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +# Why: This testcases adds uneven refinment in the taylor green vortex (TGV) to specifically test (MPI) mortar treatment in the parabolic part. 
+ +############################################################################### +# Physics + +prandtl_number() = 0.72 +mu = 6.25e-4 # Re ≈ 1600 + +equations = CompressibleEulerEquations3D(1.4) +equations_parabolic = CompressibleNavierStokesDiffusion3D( + equations, mu = mu, Prandtl = prandtl_number() +) + +############################################################################### +# Initial condition (Taylor-Green vortex) + +function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations3D) + A = 1.0 + Ms = 0.1 + + rho = 1.0 + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + + p = (A / Ms)^2 * rho / equations.gamma + p += 1.0 / 16.0 * A^2 * rho * + (cos(2x[1]) * cos(2x[3]) + + 2cos(2x[2]) + 2cos(2x[1]) + + cos(2x[2]) * cos(2x[3])) + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end + +initial_condition = initial_condition_taylor_green_vortex + +############################################################################### +# DG setup + +volume_flux = flux_ranocha + +solver = DGSEM( + polydeg = 3, + surface_flux = flux_lax_friedrichs, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux) +) + +############################################################################### +# Mesh + +coordinates_min = (-1.0, -1.0, -1.0) .* pi +coordinates_max = ( 1.0, 1.0, 1.0) .* pi + +trees_per_dimension = (2, 2, 2) + +mesh = P4estMesh( + trees_per_dimension, + polydeg = 3, + coordinates_min = coordinates_min, + coordinates_max = coordinates_max, + periodicity = (true, true, true), + initial_refinement_level = 0 +) + +############################################################################### +# Semidiscretization + +semi = SemidiscretizationHyperbolicParabolic( + mesh, + (equations, equations_parabolic), + initial_condition, + solver; + boundary_conditions = (boundary_condition_periodic, + boundary_condition_periodic) +) + 
+############################################################################### +# AMR: positional rule → creates 2 refinment bocks on the diagonal of the cube +# Used to explicitly include mortars in the parabolic MPI testscases + +function positional_rule(x, t) + if ((x[1] < 0) && (x[2] < 0) && (x[3] < 0)) || + ((x[1] > 0) && (x[2] > 0) && (x[3] > 0)) + return 1.0 # refine + else + return 0.0 # keep coarse + end +end + +amr_indicator = IndicatorPositional(positional_rule, semi) + +amr_controller = ControllerThreeLevel( + semi, amr_indicator; + base_level = 0, + med_level = 1, med_threshold = 0.5, + max_level = 2, max_threshold = 0.9 +) + +# refine only once to get a static mortar mesh +amr_callback = AMRCallback( + semi, + amr_controller; + interval = typemax(Int), # no further AMR + adapt_initial_condition = true, + adapt_initial_condition_only_refine = true +) + +############################################################################### +# Callbacks +analysis_interval = 50 +analysis_callback = AnalysisCallback( + semi, + interval = analysis_interval, # effectively disabled for short runs + save_analysis = true, + extra_analysis_integrals = (energy_kinetic, + energy_internal, + enstrophy) +) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +callbacks = CallbackSet( + analysis_callback, + alive_callback, + amr_callback +) + +############################################################################### +# Time integration + +tspan = (0.0, 5) + +ode = semidiscretize(semi, tspan) + +sol = solve( + ode, + RDPK3SpFSAL49(); + abstol = 1e-8, + reltol = 1e-8, + ode_default_options()..., + callback = callbacks +) \ No newline at end of file diff --git a/src/Trixi.jl b/src/Trixi.jl index 167e01189c8..cf8c05b2e72 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -321,7 +321,7 @@ export load_mesh, load_time, load_timestep, load_timestep!, load_dt, load_adaptive_time_integrator! 
export ControllerThreeLevel, ControllerThreeLevelCombined, - IndicatorLöhner, IndicatorLoehner, IndicatorMax + IndicatorLöhner, IndicatorLoehner, IndicatorMax, IndicatorPositional export PositivityPreservingLimiterZhangShu, EntropyBoundedLimiter diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl index ff181897740..75c9e4bbc78 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl @@ -91,6 +91,7 @@ function SemidiscretizationHyperbolicParabolic(mesh, equations::Tuple, cache_parabolic = create_cache_parabolic(mesh, equations, solver, nelements(solver, cache), uEltype) + cache_parabolic = (; cache..., cache_parabolic...) _boundary_conditions_parabolic = digest_boundary_conditions(boundary_conditions_parabolic, mesh, solver, cache) diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 4caba61d369..23b91b3de58 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -33,27 +33,24 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, @unpack viscous_container = cache_parabolic @unpack u_transformed, gradients, flux_viscous = viscous_container - cache_p = (; cache..., - elements = cache_parabolic.elements, - interfaces = cache_parabolic.interfaces, - boundaries = cache_parabolic.boundaries) + # Convert conservative variables to a form more suitable for viscous flux calculations @trixi_timeit timer() "transform variables" begin transform_variables!(u_transformed, u, mesh, equations_parabolic, - dg, cache_p) + dg, cache_parabolic) end # Compute the gradients of the transformed variables @trixi_timeit timer() "calculate gradient" begin calc_gradient!(gradients, u_transformed, t, mesh, equations_parabolic, boundary_conditions_parabolic, - dg, parabolic_scheme, 
cache_p) + dg, parabolic_scheme, cache_parabolic) end # Compute and store the viscous fluxes @trixi_timeit timer() "calculate viscous fluxes" begin calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, - equations_parabolic, dg, cache_p) + equations_parabolic, dg, cache_parabolic) end # The remainder of this function is essentially a regular rhs! for parabolic @@ -69,41 +66,41 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # need to interpolate solutions *and* gradients to the surfaces. # Reset du - @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache_p) + @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache_parabolic) # Calculate volume integral # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache_p) + calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache_parabolic) end # Prolong solution to interfaces. # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache_p, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache_parabolic, flux_viscous, mesh, equations_parabolic, dg) end # Calculate interface fluxes # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache_p.elements.surface_flux_values, mesh, - equations_parabolic, dg, parabolic_scheme, cache_p) + calc_interface_flux!(cache_parabolic.elements.surface_flux_values, mesh, + equations_parabolic, dg, parabolic_scheme, cache_parabolic) end # Prolong viscous fluxes to boundaries. 
# This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache_p, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache_parabolic, flux_viscous, mesh, equations_parabolic, dg) end # Calculate boundary fluxes. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux_divergence!(cache_p, t, + calc_boundary_flux_divergence!(cache_parabolic, t, boundary_conditions_parabolic, mesh, equations_parabolic, dg.surface_integral, dg) @@ -112,33 +109,33 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Prolong solution to mortars. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars_divergence!(cache_p, flux_viscous, mesh, equations_parabolic, + prolong2mortars_divergence!(cache_parabolic, flux_viscous, mesh, equations_parabolic, dg.mortar, dg) end # Calculate mortar fluxes. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "mortar flux" begin - calc_mortar_flux_divergence!(cache_p.elements.surface_flux_values, + calc_mortar_flux_divergence!(cache_parabolic.elements.surface_flux_values, mesh, equations_parabolic, dg.mortar, - dg, parabolic_scheme, cache_p) + dg, parabolic_scheme, cache_parabolic) end # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. 
@trixi_timeit timer() "surface integral" begin calc_surface_integral!(du, u, mesh, equations_parabolic, - dg.surface_integral, dg, cache_p) + dg.surface_integral, dg, cache_parabolic) end # Apply Jacobian from mapping to reference element @trixi_timeit timer() "Jacobian" begin - apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache_p) + apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache_parabolic) end @trixi_timeit timer() "source terms parabolic" begin calc_sources_parabolic!(du, u, gradients, t, source_terms_parabolic, - equations_parabolic, dg, cache_p) + equations_parabolic, dg, cache_parabolic) end return nothing diff --git a/test/test_mpi_p4est_3d.jl b/test/test_mpi_p4est_3d.jl index 11086b7f593..77dd0af71cd 100644 --- a/test/test_mpi_p4est_3d.jl +++ b/test/test_mpi_p4est_3d.jl @@ -250,7 +250,70 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi, sol, 1000) -end + + + + @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_navierstokes_taylor_green_vortex.jl"), + initial_refinement_level=2, tspan=(0.0, 0.25), + surface_flux=FluxHLL(min_max_speed_naive), + l2=[ + 0.0001547509861140407, + 0.015637861347119624, + 0.015637861347119687, + 0.022024699158522523, + 0.009711013505930812 + ], + linf=[ + 0.0006696415247340326, + 0.03442565722527785, + 0.03442565722577423, + 0.06295407168705314, + 0.032857472756916195 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) + end + + + @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl (Diffusive CFL)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_navierstokes_taylor_green_vortex.jl"), + 
tspan=(0.0, 0.1), + mu=0.5, # render flow diffusion-dominated + callbacks=CallbackSet(summary_callback, analysis_callback, + alive_callback, + StepsizeCallback(cfl = 2.3, + cfl_diffusive = 0.4)), + adaptive=false, # respect CFL + ode_alg=CKLLSRK95_4S(), + l2=[ + 0.0001022410497625877, + 0.04954975879887512, + 0.049549758798875056, + 0.005853983721675305, + 0.09161121143324424 + ], + linf=[ + 0.00039284994602417633, + 0.14026307274342587, + 0.14026307274350203, + 0.017003338595870714, + 0.2823457296549634 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) + end + + + +end # 3D Testcases + end # P4estMesh MPI end # module diff --git a/test/test_parabolic_3d.jl b/test/test_parabolic_3d.jl index f8aa7e85afa..f7955f7cff8 100644 --- a/test/test_parabolic_3d.jl +++ b/test/test_parabolic_3d.jl @@ -439,6 +439,33 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex_amr_mortar.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR,"p4est_3d_dgsem", + "elixir_navierstokes_taylor_green_vortex_amr_mortar.jl"), + tspan=(0.0, 5.0), + l2=[ + 0.001716903538135598, + 0.2566781540081305, + 0.2566781540081202, + 0.22556320347015937, + 0.33779107381452556 + ], + linf=[ + 0.01139265672066836, + 1.4145605813055888, + 1.4145605813076223, + 1.070081196321056, + 3.332907472310012 + ]) + + @test isapprox(last(summary_callback.analysis_integrals[:enstrophy]), + 1.6598233348361535; + rtol=1.0e-12, atol=1.0e-14) + + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh3D: elixir_advection_diffusion_amr.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_3d_dgsem", "elixir_advection_diffusion_amr.jl"), From bf9c54f364d75362acb5398ee60fafe213bbd1b0 Mon Sep 17 00:00:00 
2001 From: "T. Jeremy P. Karpowski" Date: Mon, 23 Mar 2026 21:59:31 +0100 Subject: [PATCH 6/7] applied formatting --- ...erstokes_taylor_green_vortex_amr_mortar.jl | 105 +++---- src/callbacks_step/amr.jl | 36 ++- src/callbacks_step/amr_dg2d.jl | 40 ++- ...semidiscretization_hyperbolic_parabolic.jl | 2 +- src/solvers/dgsem/indicators.jl | 13 +- src/solvers/dgsem_p4est/dg.jl | 144 ++++----- src/solvers/dgsem_p4est/dg_2d_parabolic.jl | 13 +- .../dgsem_p4est/dg_3d_parabolic_parallel.jl | 288 ++++++++++-------- src/solvers/dgsem_p4est/dg_parallel.jl | 5 +- test/test_mpi_p4est_3d.jl | 21 +- test/test_parabolic_3d.jl | 4 +- 11 files changed, 343 insertions(+), 328 deletions(-) diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl b/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl index 52acda819b1..0196a772d9b 100644 --- a/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl +++ b/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl @@ -10,20 +10,19 @@ prandtl_number() = 0.72 mu = 6.25e-4 # Re ≈ 1600 equations = CompressibleEulerEquations3D(1.4) -equations_parabolic = CompressibleNavierStokesDiffusion3D( - equations, mu = mu, Prandtl = prandtl_number() -) +equations_parabolic = CompressibleNavierStokesDiffusion3D(equations, mu = mu, + Prandtl = prandtl_number()) ############################################################################### # Initial condition (Taylor-Green vortex) function initial_condition_taylor_green_vortex(x, t, equations::CompressibleEulerEquations3D) - A = 1.0 + A = 1.0 Ms = 0.1 rho = 1.0 - v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) v3 = 0.0 @@ -43,40 +42,34 @@ initial_condition = initial_condition_taylor_green_vortex volume_flux = flux_ranocha -solver = DGSEM( - polydeg = 3, - surface_flux = flux_lax_friedrichs, - volume_integral = 
VolumeIntegralFluxDifferencing(volume_flux) -) +solver = DGSEM(polydeg = 3, + surface_flux = flux_lax_friedrichs, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) ############################################################################### # Mesh coordinates_min = (-1.0, -1.0, -1.0) .* pi -coordinates_max = ( 1.0, 1.0, 1.0) .* pi +coordinates_max = (1.0, 1.0, 1.0) .* pi trees_per_dimension = (2, 2, 2) -mesh = P4estMesh( - trees_per_dimension, - polydeg = 3, - coordinates_min = coordinates_min, - coordinates_max = coordinates_max, - periodicity = (true, true, true), - initial_refinement_level = 0 -) +mesh = P4estMesh(trees_per_dimension, + polydeg = 3, + coordinates_min = coordinates_min, + coordinates_max = coordinates_max, + periodicity = (true, true, true), + initial_refinement_level = 0) ############################################################################### # Semidiscretization -semi = SemidiscretizationHyperbolicParabolic( - mesh, - (equations, equations_parabolic), - initial_condition, - solver; - boundary_conditions = (boundary_condition_periodic, - boundary_condition_periodic) -) +semi = SemidiscretizationHyperbolicParabolic(mesh, + (equations, equations_parabolic), + initial_condition, + solver; + boundary_conditions = (boundary_condition_periodic, + boundary_condition_periodic)) ############################################################################### # AMR: positional rule → creates 2 refinment bocks on the diagonal of the cube @@ -93,41 +86,33 @@ end amr_indicator = IndicatorPositional(positional_rule, semi) -amr_controller = ControllerThreeLevel( - semi, amr_indicator; - base_level = 0, - med_level = 1, med_threshold = 0.5, - max_level = 2, max_threshold = 0.9 -) +amr_controller = ControllerThreeLevel(semi, amr_indicator; + base_level = 0, + med_level = 1, med_threshold = 0.5, + max_level = 2, max_threshold = 0.9) # refine only once to get a static mortar mesh -amr_callback = AMRCallback( - semi, - amr_controller; - 
interval = typemax(Int), # no further AMR - adapt_initial_condition = true, - adapt_initial_condition_only_refine = true -) +amr_callback = AMRCallback(semi, + amr_controller; + interval = typemax(Int), # no further AMR + adapt_initial_condition = true, + adapt_initial_condition_only_refine = true) ############################################################################### # Callbacks analysis_interval = 50 -analysis_callback = AnalysisCallback( - semi, - interval = analysis_interval, # effectively disabled for short runs - save_analysis = true, - extra_analysis_integrals = (energy_kinetic, - energy_internal, - enstrophy) -) +analysis_callback = AnalysisCallback(semi, + interval = analysis_interval, # effectively disabled for short runs + save_analysis = true, + extra_analysis_integrals = (energy_kinetic, + energy_internal, + enstrophy)) alive_callback = AliveCallback(analysis_interval = analysis_interval) -callbacks = CallbackSet( - analysis_callback, - alive_callback, - amr_callback -) +callbacks = CallbackSet(analysis_callback, + alive_callback, + amr_callback) ############################################################################### # Time integration @@ -136,11 +121,9 @@ tspan = (0.0, 5) ode = semidiscretize(semi, tspan) -sol = solve( - ode, - RDPK3SpFSAL49(); - abstol = 1e-8, - reltol = 1e-8, - ode_default_options()..., - callback = callbacks -) \ No newline at end of file +sol = solve(ode, + RDPK3SpFSAL49(); + abstol = 1e-8, + reltol = 1e-8, + ode_default_options()..., + callback = callbacks) diff --git a/src/callbacks_step/amr.jl b/src/callbacks_step/amr.jl index 64c2182af8c..fdc9e5a0eb0 100644 --- a/src/callbacks_step/amr.jl +++ b/src/callbacks_step/amr.jl @@ -633,7 +633,7 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::P4estMesh, partition!(mesh) rebalance_solver!(u_ode, mesh, equations, dg, cache, old_global_first_quadrant) - + # Resync parabolic cache after repartitioning @unpack viscous_container = cache_parabolic 
resize!(viscous_container, equations, dg, cache) @@ -641,30 +641,38 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::P4estMesh, if hasproperty(cache_parabolic, :elements) nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1) resize!(cache_parabolic.elements, nelements) - copyto!(cache_parabolic.elements.node_coordinates, cache.elements.node_coordinates) - copyto!(cache_parabolic.elements.jacobian_matrix, cache.elements.jacobian_matrix) - copyto!(cache_parabolic.elements.contravariant_vectors, cache.elements.contravariant_vectors) - copyto!(cache_parabolic.elements.inverse_jacobian, cache.elements.inverse_jacobian) - copyto!(cache_parabolic.elements.surface_flux_values, cache.elements.surface_flux_values) + copyto!(cache_parabolic.elements.node_coordinates, + cache.elements.node_coordinates) + copyto!(cache_parabolic.elements.jacobian_matrix, + cache.elements.jacobian_matrix) + copyto!(cache_parabolic.elements.contravariant_vectors, + cache.elements.contravariant_vectors) + copyto!(cache_parabolic.elements.inverse_jacobian, + cache.elements.inverse_jacobian) + copyto!(cache_parabolic.elements.surface_flux_values, + cache.elements.surface_flux_values) end if hasproperty(cache_parabolic, :interfaces) ninterfaces = size(cache.interfaces.neighbor_ids, 2) resize!(cache_parabolic.interfaces, ninterfaces) - copyto!(cache_parabolic.interfaces.u, cache.interfaces.u) - copyto!(cache_parabolic.interfaces.neighbor_ids, cache.interfaces.neighbor_ids) - copyto!(cache_parabolic.interfaces.node_indices, cache.interfaces.node_indices) + copyto!(cache_parabolic.interfaces.u, cache.interfaces.u) + copyto!(cache_parabolic.interfaces.neighbor_ids, + cache.interfaces.neighbor_ids) + copyto!(cache_parabolic.interfaces.node_indices, + cache.interfaces.node_indices) end if hasproperty(cache_parabolic, :boundaries) nboundaries = length(cache.boundaries.neighbor_ids) resize!(cache_parabolic.boundaries, nboundaries) - copyto!(cache_parabolic.boundaries.u, 
cache.boundaries.u) - copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids) - copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices) - copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) + copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) + copyto!(cache_parabolic.boundaries.neighbor_ids, + cache.boundaries.neighbor_ids) + copyto!(cache_parabolic.boundaries.node_indices, + cache.boundaries.node_indices) + copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) end - end end diff --git a/src/callbacks_step/amr_dg2d.jl b/src/callbacks_step/amr_dg2d.jl index 91fad708877..6795a752063 100644 --- a/src/callbacks_step/amr_dg2d.jl +++ b/src/callbacks_step/amr_dg2d.jl @@ -198,11 +198,16 @@ function refine!(u_ode::AbstractVector, adaptor, nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1) resize!(cache_parabolic.elements, nelements) - copyto!(cache_parabolic.elements.node_coordinates, cache.elements.node_coordinates) - copyto!(cache_parabolic.elements.jacobian_matrix, cache.elements.jacobian_matrix) - copyto!(cache_parabolic.elements.contravariant_vectors, cache.elements.contravariant_vectors) - copyto!(cache_parabolic.elements.inverse_jacobian, cache.elements.inverse_jacobian) - copyto!(cache_parabolic.elements.surface_flux_values, cache.elements.surface_flux_values) + copyto!(cache_parabolic.elements.node_coordinates, + cache.elements.node_coordinates) + copyto!(cache_parabolic.elements.jacobian_matrix, + cache.elements.jacobian_matrix) + copyto!(cache_parabolic.elements.contravariant_vectors, + cache.elements.contravariant_vectors) + copyto!(cache_parabolic.elements.inverse_jacobian, + cache.elements.inverse_jacobian) + copyto!(cache_parabolic.elements.surface_flux_values, + cache.elements.surface_flux_values) end if hasproperty(cache_parabolic, :interfaces) resize!(cache_parabolic.interfaces, size(cache.interfaces.neighbor_ids, 2)) @@ -213,10 +218,10 @@ function 
refine!(u_ode::AbstractVector, adaptor, if hasproperty(cache_parabolic, :boundaries) resize!(cache_parabolic.boundaries, length(cache.boundaries.neighbor_ids)) - copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) + copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids) copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices) - copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) + copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) end return nothing @@ -413,16 +418,21 @@ function coarsen!(u_ode::AbstractVector, adaptor, # Resize parabolic helper variables @unpack viscous_container = cache_parabolic resize!(viscous_container, equations, dg, cache) - # Keep parabolic topology containers in sync if they exist (currently only for P4estMesh thus the conditional + # Keep parabolic topology containers in sync if they exist (currently only for P4estMesh thus the conditional if hasproperty(cache_parabolic, :elements) nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1) resize!(cache_parabolic.elements, nelements) - copyto!(cache_parabolic.elements.node_coordinates, cache.elements.node_coordinates) - copyto!(cache_parabolic.elements.jacobian_matrix, cache.elements.jacobian_matrix) - copyto!(cache_parabolic.elements.contravariant_vectors, cache.elements.contravariant_vectors) - copyto!(cache_parabolic.elements.inverse_jacobian, cache.elements.inverse_jacobian) - copyto!(cache_parabolic.elements.surface_flux_values, cache.elements.surface_flux_values) + copyto!(cache_parabolic.elements.node_coordinates, + cache.elements.node_coordinates) + copyto!(cache_parabolic.elements.jacobian_matrix, + cache.elements.jacobian_matrix) + copyto!(cache_parabolic.elements.contravariant_vectors, + cache.elements.contravariant_vectors) + copyto!(cache_parabolic.elements.inverse_jacobian, + cache.elements.inverse_jacobian) + 
copyto!(cache_parabolic.elements.surface_flux_values, + cache.elements.surface_flux_values) end if hasproperty(cache_parabolic, :interfaces) resize!(cache_parabolic.interfaces, size(cache.interfaces.neighbor_ids, 2)) @@ -433,10 +443,10 @@ function coarsen!(u_ode::AbstractVector, adaptor, if hasproperty(cache_parabolic, :boundaries) resize!(cache_parabolic.boundaries, length(cache.boundaries.neighbor_ids)) - copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) + copyto!(cache_parabolic.boundaries.u, cache.boundaries.u) copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids) copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices) - copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) + copyto!(cache_parabolic.boundaries.name, cache.boundaries.name) end return nothing diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl index 75c9e4bbc78..b444c0d7cd7 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl @@ -91,7 +91,7 @@ function SemidiscretizationHyperbolicParabolic(mesh, equations::Tuple, cache_parabolic = create_cache_parabolic(mesh, equations, solver, nelements(solver, cache), uEltype) - cache_parabolic = (; cache..., cache_parabolic...) + cache_parabolic = (; cache..., cache_parabolic...) _boundary_conditions_parabolic = digest_boundary_conditions(boundary_conditions_parabolic, mesh, solver, cache) diff --git a/src/solvers/dgsem/indicators.jl b/src/solvers/dgsem/indicators.jl index 713dc92672e..43c73bdc3f2 100644 --- a/src/solvers/dgsem/indicators.jl +++ b/src/solvers/dgsem/indicators.jl @@ -490,30 +490,30 @@ function Base.show(io::IO, ::MIME"text/plain", return nothing end - """ IndicatorPositional(f) Create an AMR indicator from a positional rule `f(x, t)`. 
The usecase for this rule is explict testing and or targeted "static" refinment based on a functional rule. The function `f` is called with the element center `x::SVector{ndims,Float64}` and time `t`. """ -struct IndicatorPositional{F,Cache}<:AbstractIndicator +struct IndicatorPositional{F, Cache} <: AbstractIndicator rule::F cache::Cache end # this method is used when the indicator is constructed as for AMR -function IndicatorPositional(rule,semi::AbstractSemidiscretization) +function IndicatorPositional(rule, semi::AbstractSemidiscretization) cache = create_cache(IndicatorPositional, semi) return IndicatorPositional{typeof(rule), typeof(cache)}(rule, cache) end -function (positional::IndicatorPositional)(u, mesh, equations, dg, cache; t=0.0, iter=0) +function (positional::IndicatorPositional)(u, mesh, equations, dg, cache; t = 0.0, + iter = 0) x = cache.elements.node_coordinates @unpack alpha, center_threaded = positional.cache resize!(alpha, nelements(dg, cache)) #### @threaded does not work with cfunction (positional.rule) thus Threads.@threads for now - Threads.@threads for element in Trixi.eachelement(dg, cache) + Threads.@threads for element in Trixi.eachelement(dg, cache) center = center_threaded[Threads.threadid()] fill!(center, 0.0) n = 0 @@ -524,10 +524,9 @@ function (positional::IndicatorPositional)(u, mesh, equations, dg, cache; t=0.0, center[3] += x[3, i, j, k, element] n += 1 end - center ./= n + center ./= n alpha[element] = positional.rule(center, t) end return alpha end - end # @muladd diff --git a/src/solvers/dgsem_p4est/dg.jl b/src/solvers/dgsem_p4est/dg.jl index 9f0670ee2b1..a74f2a9d4d6 100644 --- a/src/solvers/dgsem_p4est/dg.jl +++ b/src/solvers/dgsem_p4est/dg.jl @@ -8,73 +8,73 @@ # This method is called when a SemidiscretizationHyperbolic is constructed. # It constructs the basic `cache` used throughout the simulation to compute # the RHS etc. 
- function create_cache(mesh::P4estMesh, equations::AbstractEquations, dg::DG, ::Any, - ::Type{uEltype}) where {uEltype <: Real} - # Make sure to balance the `p4est` before creating any containers - # in case someone has tampered with the `p4est` after creating the mesh - balance!(mesh) - - elements = init_elements(mesh, equations, dg.basis, uEltype) - interfaces = init_interfaces(mesh, equations, dg.basis, elements) - boundaries = init_boundaries(mesh, equations, dg.basis, elements) - mortars = init_mortars(mesh, equations, dg.basis, elements) - - # Container cache - cache = (; elements, interfaces, boundaries, mortars) - - # Add Volume-Integral cache - cache = (; cache..., - create_cache(mesh, equations, dg.volume_integral, dg, cache, uEltype)...) - # Add Mortar cache - cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) - - return cache - end +function create_cache(mesh::P4estMesh, equations::AbstractEquations, dg::DG, ::Any, + ::Type{uEltype}) where {uEltype <: Real} + # Make sure to balance the `p4est` before creating any containers + # in case someone has tampered with the `p4est` after creating the mesh + balance!(mesh) - # This method is called when a SemidiscretizationHyperbolic is constructed. - # It constructs the basic `cache` used throughout the simulation to compute - # the RHS etc. 
- function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG, ::Any, - ::Type{uEltype}) where {uEltype <: Real} - # Make sure to balance the `p4est` before creating any containers - # in case someone has tampered with the `p4est` after creating the mesh - balance!(mesh.parent) - - elements_parent = init_elements(mesh.parent, equations, dg.basis, uEltype) - interfaces_parent = init_interfaces(mesh.parent, equations, dg.basis, - elements_parent) - boundaries_parent = init_boundaries(mesh.parent, equations, dg.basis, - elements_parent) - mortars_parent = init_mortars(mesh.parent, equations, dg.basis, elements_parent) - - # Extract data for views. - elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, - interfaces_parent, - boundaries_parent, - mortars_parent, - mesh, - equations, - dg, - uEltype) - - cache = (; elements, interfaces, boundaries, mortars, neighbor_ids_parent) - - # Add Volume-Integral cache - cache = (; cache..., - create_cache(mesh, equations, dg.volume_integral, dg, uEltype)...) - # Add Mortar cache - cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) - - return cache - end + elements = init_elements(mesh, equations, dg.basis, uEltype) + interfaces = init_interfaces(mesh, equations, dg.basis, elements) + boundaries = init_boundaries(mesh, equations, dg.basis, elements) + mortars = init_mortars(mesh, equations, dg.basis, elements) + + # Container cache + cache = (; elements, interfaces, boundaries, mortars) + + # Add Volume-Integral cache + cache = (; cache..., + create_cache(mesh, equations, dg.volume_integral, dg, cache, uEltype)...) + # Add Mortar cache + cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) + + return cache +end + +# This method is called when a SemidiscretizationHyperbolic is constructed. +# It constructs the basic `cache` used throughout the simulation to compute +# the RHS etc. 
+function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG, ::Any, + ::Type{uEltype}) where {uEltype <: Real} + # Make sure to balance the `p4est` before creating any containers + # in case someone has tampered with the `p4est` after creating the mesh + balance!(mesh.parent) + + elements_parent = init_elements(mesh.parent, equations, dg.basis, uEltype) + interfaces_parent = init_interfaces(mesh.parent, equations, dg.basis, + elements_parent) + boundaries_parent = init_boundaries(mesh.parent, equations, dg.basis, + elements_parent) + mortars_parent = init_mortars(mesh.parent, equations, dg.basis, elements_parent) + + # Extract data for views. + elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, + interfaces_parent, + boundaries_parent, + mortars_parent, + mesh, + equations, + dg, + uEltype) + + cache = (; elements, interfaces, boundaries, mortars, neighbor_ids_parent) + + # Add Volume-Integral cache + cache = (; cache..., + create_cache(mesh, equations, dg.volume_integral, dg, uEltype)...) + # Add Mortar cache + cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) 
+ + return cache +end function create_cache_parabolic(mesh::P4estMesh, equations_hyperbolic::AbstractEquations, - dg::DG, n_elements, uEltype) + dg::DG, n_elements, uEltype) # Keep consistent with the baseline p4est cache balance!(mesh) - elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype) + elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype) interfaces = init_interfaces(mesh, equations_hyperbolic, dg.basis, elements) boundaries = init_boundaries(mesh, equations_hyperbolic, dg.basis, elements) viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), @@ -87,22 +87,25 @@ end function create_cache_parabolic(mesh::P4estMeshView, equations_hyperbolic::AbstractEquations, dg::DG, n_elements, uEltype) - - balance!(mesh.parent) - elements_parent = init_elements(mesh.parent, equations_hyperbolic, dg.basis, uEltype) + elements_parent = init_elements(mesh.parent, equations_hyperbolic, dg.basis, + uEltype) interfaces_parent = init_interfaces(mesh.parent, equations_hyperbolic, dg.basis, elements_parent) boundaries_parent = init_boundaries(mesh.parent, equations_hyperbolic, dg.basis, elements_parent) - mortars_parent = init_mortars(mesh.parent, equations_hyperbolic, dg.basis, - elements_parent) - - elements, interfaces, boundaries, mortars, neighbor_ids_parent = - extract_p4est_mesh_view(elements_parent, interfaces_parent, - boundaries_parent, mortars_parent, - mesh, equations_hyperbolic, dg, uEltype) + mortars_parent = init_mortars(mesh.parent, equations_hyperbolic, dg.basis, + elements_parent) + + elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, + interfaces_parent, + boundaries_parent, + mortars_parent, + mesh, + equations_hyperbolic, + dg, + uEltype) viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), nnodes(dg), n_elements, @@ -111,7 +114,6 @@ function create_cache_parabolic(mesh::P4estMeshView, return (; elements, interfaces, 
boundaries, neighbor_ids_parent, viscous_container) end - # Extract outward-pointing normal direction # (contravariant vector ±Ja^i, i = index) # Note that this vector is not normalized diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 23b91b3de58..50ff5893843 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -33,7 +33,6 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, @unpack viscous_container = cache_parabolic @unpack u_transformed, gradients, flux_viscous = viscous_container - # Convert conservative variables to a form more suitable for viscous flux calculations @trixi_timeit timer() "transform variables" begin transform_variables!(u_transformed, u, mesh, equations_parabolic, @@ -72,14 +71,16 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache_parabolic) + calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, + cache_parabolic) end # Prolong solution to interfaces. # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache_parabolic, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache_parabolic, flux_viscous, mesh, equations_parabolic, + dg) end # Calculate interface fluxes @@ -94,7 +95,8 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. 
@trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache_parabolic, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache_parabolic, flux_viscous, mesh, equations_parabolic, + dg) end # Calculate boundary fluxes. @@ -109,7 +111,8 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Prolong solution to mortars. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars_divergence!(cache_parabolic, flux_viscous, mesh, equations_parabolic, + prolong2mortars_divergence!(cache_parabolic, flux_viscous, mesh, + equations_parabolic, dg.mortar, dg) end diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl index e6cf2189e88..0b57ccc560f 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl @@ -5,10 +5,8 @@ @muladd begin #! format: noindent - - - -function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, +function rhs_parabolic!(du, u, t, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic::AbstractEquationsParabolic, boundary_conditions_parabolic, source_terms_parabolic, dg::DG, parabolic_scheme, cache, cache_parabolic) @@ -16,9 +14,9 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshPa @unpack u_transformed, gradients, flux_viscous = viscous_container cache_p = (; cache..., - elements = cache_parabolic.elements, - interfaces = cache_parabolic.interfaces, - boundaries = cache_parabolic.boundaries) + elements = cache_parabolic.elements, + interfaces = cache_parabolic.interfaces, + boundaries = cache_parabolic.boundaries) # Stage 0: local variable transform # @@ -75,14 +73,14 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshPa dg, parabolic_scheme, cache_p) end - # MPI mortar fluxes for 
gradient stage - @trixi_timeit timer() "MPI mortar flux gradient" begin - calc_mpi_mortar_flux_gradient!(cache_p.elements.surface_flux_values, - mesh, equations_parabolic, dg.mortar, - dg, parabolic_scheme, cache_p) - end + # MPI mortar fluxes for gradient stage + @trixi_timeit timer() "MPI mortar flux gradient" begin + calc_mpi_mortar_flux_gradient!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache_p) + end - # Calculate surface integrals + # Calculate surface integrals @trixi_timeit timer() "surface integral" begin calc_surface_integral_gradient!(gradients, mesh, equations_parabolic, dg, cache_p) @@ -213,13 +211,8 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshPa return nothing end - - - - - function calc_gradient!(gradients, u_transformed, t, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic, boundary_conditions_parabolic, dg::DG, parabolic_scheme, cache) # Reset gradients @@ -278,8 +271,6 @@ function calc_gradient!(gradients, u_transformed, t, return nothing end - - # This version is called during `calc_gradients!` and must be specialized because the # MAGNITUDE of the flux does NOT depend on the outward normal. # Thus, you don't need to scale by 2 (e.g., the scaling factor in the normals (and in the @@ -287,7 +278,8 @@ end # interface in 2D) and flip the sign when storing the mortar fluxes back # into `surface_flux_values`. 
@inline function mortar_fluxes_to_elements_gradient!(surface_flux_values, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic::AbstractEquationsParabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM, cache, mortar, @@ -362,7 +354,8 @@ end end function calc_interface_flux_gradient!(surface_flux_values, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, dg::DG, parabolic_scheme, cache) @unpack neighbor_ids, node_indices = cache.interfaces @@ -439,7 +432,9 @@ function calc_interface_flux_gradient!(surface_flux_values, end # This version is used for parabolic gradient computations -@inline function calc_interface_flux_gradient!(surface_flux_values, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, +@inline function calc_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, dg::DG, parabolic_scheme, cache, interface_index, normal_direction, @@ -470,10 +465,9 @@ end return nothing end - - function calc_mpi_interface_flux_gradient!(surface_flux_values, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, dg::DG, parabolic_scheme, cache) @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces @@ -516,12 +510,12 @@ function calc_mpi_interface_flux_gradient!(surface_flux_values, local_element) calc_mpi_interface_flux_gradient!(surface_flux_values, mesh, - equations_parabolic, - dg, parabolic_scheme, cache, - interface, normal_direction, - i, j, local_side, - i_surface, j_surface, - local_direction, local_element) + equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, + i, j, local_side, + i_surface, j_surface, + local_direction, local_element) # Increment local element 
indices to pull the normal direction i_element += i_element_step_i j_element += j_element_step_i @@ -567,7 +561,7 @@ end for v in eachvariable(equations_parabolic) surface_flux_values[v, surface_i_node_index, surface_j_node_index, - local_direction_index, local_element_index] = flux_[v] + local_direction_index, local_element_index] = flux_[v] end return nothing @@ -576,7 +570,8 @@ end # This is the version used when calculating the divergence of the viscous fluxes. # Identical to weak-form volume integral/kernel for the purely hyperbolic case, # except that the fluxes are here already precomputed in `calc_viscous_fluxes!` -function calc_volume_integral!(du, flux_viscous, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, +function calc_volume_integral!(du, flux_viscous, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) (; derivative_hat) = dg.basis @@ -752,7 +747,8 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, end # This version is used for divergence flux computations -function calc_interface_flux!(surface_flux_values, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, +function calc_interface_flux!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic, dg::DG, parabolic_scheme, cache) (; neighbor_ids, node_indices) = cache.interfaces @@ -843,7 +839,9 @@ function calc_interface_flux!(surface_flux_values, mesh::Union{P4estMeshParallel end function prolong2mortars_divergence!(cache, flux_viscous, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) @unpack neighbor_ids, node_indices = cache.mortars @@ -973,13 +971,12 @@ function prolong2mortars_divergence!(cache, flux_viscous, return nothing end - function prolong2mpimortars_divergence!(cache, flux_viscous, - 
mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) - @unpack node_indices = cache.mpi_mortars @unpack contravariant_vectors = cache.elements index_range = eachnode(dg) @@ -994,25 +991,24 @@ function prolong2mpimortars_divergence!(cache, flux_viscous, small_indices = node_indices[1, mortar] direction_index = indices2direction(small_indices) - i_small_start, i_small_step_i, i_small_step_j = - index_to_start_step_3d(small_indices[1], index_range) - j_small_start, j_small_step_i, j_small_step_j = - index_to_start_step_3d(small_indices[2], index_range) - k_small_start, k_small_step_i, k_small_step_j = - index_to_start_step_3d(small_indices[3], index_range) + i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1], + index_range) + j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2], + index_range) + k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], + index_range) # Large side indexing large_indices = node_indices[2, mortar] - i_large_start, i_large_step_i, i_large_step_j = - index_to_start_step_3d(large_indices[1], index_range) - j_large_start, j_large_step_i, j_large_step_j = - index_to_start_step_3d(large_indices[2], index_range) - k_large_start, k_large_step_i, k_large_step_j = - index_to_start_step_3d(large_indices[3], index_range) + i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_indices[1], + index_range) + j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_indices[2], + index_range) + k_large_start, k_large_step_i, k_large_step_j = index_to_start_step_3d(large_indices[3], + index_range) for (element, position) in zip(local_neighbor_ids, local_neighbor_positions) - if position == 5 # ========================= # LARGE ELEMENT @@ -1028,18 +1024,21 @@ function 
prolong2mpimortars_divergence!(cache, flux_viscous, for i in eachnode(dg) normal_direction = get_normal_direction(direction_index, contravariant_vectors, - i_large, j_large, k_large, + i_large, j_large, + k_large, element) for v in eachvariable(equations_parabolic) - flux_node = SVector( - flux_viscous_x[v, i_large, j_large, k_large, element], - flux_viscous_y[v, i_large, j_large, k_large, element], - flux_viscous_z[v, i_large, j_large, k_large, element] - ) + flux_node = SVector(flux_viscous_x[v, i_large, j_large, + k_large, element], + flux_viscous_y[v, i_large, j_large, + k_large, element], + flux_viscous_z[v, i_large, j_large, + k_large, element]) # same convention as local code - u_buffer[v, i, j] = -0.5f0 * dot(flux_node, normal_direction) + u_buffer[v, i, j] = -0.5f0 * + dot(flux_node, normal_direction) end i_large += i_large_step_i @@ -1051,19 +1050,23 @@ function prolong2mpimortars_divergence!(cache, flux_viscous, k_large += k_large_step_j end - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 1, :, :, mortar), + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 1, :, :, + mortar), mortar_l2.forward_lower, mortar_l2.forward_lower, u_buffer, fstar_tmp) - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 2, :, :, mortar), + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 2, :, :, + mortar), mortar_l2.forward_upper, mortar_l2.forward_lower, u_buffer, fstar_tmp) - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 3, :, :, mortar), + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 3, :, :, + mortar), mortar_l2.forward_lower, mortar_l2.forward_upper, u_buffer, fstar_tmp) - multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 4, :, :, mortar), + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 4, :, :, + mortar), mortar_l2.forward_upper, mortar_l2.forward_upper, u_buffer, fstar_tmp) @@ -1080,18 +1083,20 @@ function prolong2mpimortars_divergence!(cache, flux_viscous, for i in eachnode(dg) normal_direction = 
get_normal_direction(direction_index, contravariant_vectors, - i_small, j_small, k_small, + i_small, j_small, + k_small, element) for v in eachvariable(equations_parabolic) - flux_node = SVector( - flux_viscous_x[v, i_small, j_small, k_small, element], - flux_viscous_y[v, i_small, j_small, k_small, element], - flux_viscous_z[v, i_small, j_small, k_small, element] - ) - - cache.mpi_mortars.u[1, v, position, i, j, mortar] = - dot(flux_node, normal_direction) + flux_node = SVector(flux_viscous_x[v, i_small, j_small, + k_small, element], + flux_viscous_y[v, i_small, j_small, + k_small, element], + flux_viscous_z[v, i_small, j_small, + k_small, element]) + + cache.mpi_mortars.u[1, v, position, i, j, mortar] = dot(flux_node, + normal_direction) end i_small += i_small_step_i @@ -1110,7 +1115,8 @@ function prolong2mpimortars_divergence!(cache, flux_viscous, end function calc_mortar_flux_divergence!(surface_flux_values, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DG, parabolic_scheme, cache) @@ -1186,7 +1192,6 @@ function calc_mortar_flux_divergence!(surface_flux_values, return nothing end - function calc_mpi_interface_flux_divergence!(surface_flux_values, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, @@ -1202,22 +1207,22 @@ function calc_mpi_interface_flux_divergence!(surface_flux_values, local_direction = indices2direction(local_indices) local_side = local_sides[interface] - i_element_start, i_element_step_i, i_element_step_j = - index_to_start_step_3d(local_indices[1], index_range) - j_element_start, j_element_step_i, j_element_step_j = - index_to_start_step_3d(local_indices[2], index_range) - k_element_start, k_element_step_i, k_element_step_j = - index_to_start_step_3d(local_indices[3], index_range) + i_element_start, i_element_step_i, i_element_step_j = index_to_start_step_3d(local_indices[1], + index_range) + 
j_element_start, j_element_step_i, j_element_step_j = index_to_start_step_3d(local_indices[2], + index_range) + k_element_start, k_element_step_i, k_element_step_j = index_to_start_step_3d(local_indices[3], + index_range) i_element = i_element_start j_element = j_element_start k_element = k_element_start local_surface_indices = surface_indices(local_indices) - i_surface_start, i_surface_step_i, i_surface_step_j = - index_to_start_step_3d(local_surface_indices[1], index_range) - j_surface_start, j_surface_step_i, j_surface_step_j = - index_to_start_step_3d(local_surface_indices[2], index_range) + i_surface_start, i_surface_step_i, i_surface_step_j = index_to_start_step_3d(local_surface_indices[1], + index_range) + j_surface_start, j_surface_step_i, j_surface_step_j = index_to_start_step_3d(local_surface_indices[2], + index_range) i_surface = i_surface_start j_surface = j_surface_start @@ -1271,21 +1276,22 @@ end local_element_index) @unpack u = cache.mpi_interfaces - viscous_flux_normal_ll, viscous_flux_normal_rr = - get_surface_node_vars(u, equations_parabolic, dg, - interface_i_node_index, interface_j_node_index, - interface_index) + viscous_flux_normal_ll, viscous_flux_normal_rr = get_surface_node_vars(u, + equations_parabolic, + dg, + interface_i_node_index, + interface_j_node_index, + interface_index) - - flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, - normal_direction, Divergence(), - equations_parabolic, parabolic_scheme) - - dirFactor = (local_side==1) ? 1 : -1 + flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + + dirFactor = (local_side == 1) ? 
1 : -1 for v in eachvariable(equations_parabolic) surface_flux_values[v, surface_i_node_index, surface_j_node_index, - local_direction_index, local_element_index] = dirFactor .* flux_[v] + local_direction_index, local_element_index] = dirFactor .* flux_[v] end return nothing @@ -1340,8 +1346,10 @@ end @unpack u = cache.mpi_mortars for v in eachvariable(equations_parabolic) - viscous_flux_normal_ll = u[1, v, position_index, i_node_index, j_node_index, mortar_index] - viscous_flux_normal_rr = u[2, v, position_index, i_node_index, j_node_index, mortar_index] + viscous_flux_normal_ll = u[1, v, position_index, i_node_index, j_node_index, + mortar_index] + viscous_flux_normal_rr = u[2, v, position_index, i_node_index, j_node_index, + mortar_index] flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, normal_direction, Divergence(), @@ -1363,14 +1371,16 @@ end @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars mpi_mortar_fluxes_to_elements!(surface_flux_values, mesh, - equations_parabolic, mortar_l2, dg, cache, - mortar, fstar, fstar, u_buffer, fstar_tmp) + equations_parabolic, mortar_l2, dg, cache, + mortar, fstar, fstar, u_buffer, fstar_tmp) return nothing end function calc_mortar_flux_gradient!(surface_flux_values, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DG, parabolic_scheme, cache) @unpack neighbor_ids, node_indices = cache.mortars @@ -1448,7 +1458,8 @@ end # terms (e.g., we compute a viscous conservative "flux") and thus no # non-conservative terms are present. 
@inline function calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, dg::DG, parabolic_scheme, cache, mortar_index, position_index, @@ -1472,9 +1483,9 @@ end return nothing end - function calc_mpi_mortar_flux_gradient!(surface_flux_values, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DG, parabolic_scheme, cache) @@ -1553,10 +1564,10 @@ end large_direction = indices2direction(large_indices) large_surface_indices = surface_indices(large_indices) - i_large_start, i_large_step_i, i_large_step_j = - index_to_start_step_3d(large_surface_indices[1], index_range) - j_large_start, j_large_step_i, j_large_step_j = - index_to_start_step_3d(large_surface_indices[2], index_range) + i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_surface_indices[1], + index_range) + j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_surface_indices[2], + index_range) for (element, position) in zip(local_neighbor_ids[mortar], local_neighbor_positions[mortar]) @@ -1590,7 +1601,7 @@ end for i in eachnode(dg) for v in eachvariable(equations_parabolic) surface_flux_values[v, i_large, j_large, - large_direction, element] = u_buffer[v, i, j] + large_direction, element] = u_buffer[v, i, j] end i_large += i_large_step_i j_large += j_large_step_i @@ -1602,8 +1613,10 @@ end for j in eachnode(dg) for i in eachnode(dg) for v in eachvariable(equations_parabolic) - surface_flux_values[v, i, j, small_direction, element] = - fstar_primary[v, i, j, position] + surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v, + i, + j, + position] end end end @@ -1613,12 +1626,9 @@ end return nothing end - - - - function calc_volume_integral_gradient!(gradients, 
u_transformed, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, # for dispatch only + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, # for dispatch only equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @unpack derivative_hat = dg.basis @@ -1758,7 +1768,8 @@ end function calc_boundary_flux!(cache, t, boundary_condition_parabolic, # works with Dict types boundary_condition_indices, - operator_type, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + operator_type, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations_parabolic::AbstractEquationsParabolic, surface_integral, dg::DG) (; boundaries) = cache @@ -1831,7 +1842,8 @@ function calc_boundary_flux!(cache, t, end function calc_surface_integral_gradient!(gradients, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, # for dispatch only + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, # for dispatch only equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @@ -1931,10 +1943,9 @@ function calc_surface_integral_gradient!(gradients, return nothing end - - function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, - mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic, dg::DG) @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces @unpack contravariant_vectors = cache.elements @@ -1947,11 +1958,14 @@ function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, local_indices = node_indices[interface] local_direction = indices2direction(local_indices) local_side = local_sides[interface] - orientationFactor = local_side==1 ? 1 : -1 + orientationFactor = local_side == 1 ? 
1 : -1 - i_start, i_step_i, i_step_j = index_to_start_step_3d(local_indices[1], index_range) - j_start, j_step_i, j_step_j = index_to_start_step_3d(local_indices[2], index_range) - k_start, k_step_i, k_step_j = index_to_start_step_3d(local_indices[3], index_range) + i_start, i_step_i, i_step_j = index_to_start_step_3d(local_indices[1], + index_range) + j_start, j_step_i, j_step_j = index_to_start_step_3d(local_indices[2], + index_range) + k_start, k_step_i, k_step_j = index_to_start_step_3d(local_indices[3], + index_range) i_elem = i_start j_elem = j_start @@ -1965,13 +1979,16 @@ function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, local_element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector( - flux_viscous_x[v, i_elem, j_elem, k_elem, local_element], - flux_viscous_y[v, i_elem, j_elem, k_elem, local_element], - flux_viscous_z[v, i_elem, j_elem, k_elem, local_element] - ) - - cache.mpi_interfaces.u[local_side, v, i, j, interface] = orientationFactor .* dot(flux_viscous, normal_direction) + flux_viscous = SVector(flux_viscous_x[v, i_elem, j_elem, k_elem, + local_element], + flux_viscous_y[v, i_elem, j_elem, k_elem, + local_element], + flux_viscous_z[v, i_elem, j_elem, k_elem, + local_element]) + + cache.mpi_interfaces.u[local_side, v, i, j, interface] = orientationFactor .* + dot(flux_viscous, + normal_direction) end i_elem += i_step_i @@ -1988,12 +2005,13 @@ function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, return nothing end - # Needed to *not* flip the sign of the inverse Jacobian. # This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, # where f(u) is the inviscid flux and g(u) is the viscous flux. 
-function apply_jacobian_parabolic!(du::AbstractArray, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, +function apply_jacobian_parabolic!(du::AbstractArray, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl index 636292bc41a..372f3cc81ea 100644 --- a/src/solvers/dgsem_p4est/dg_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_parallel.jl @@ -266,7 +266,7 @@ end function create_cache_parabolic(mesh::P4estMeshParallel, equations_hyperbolic::AbstractEquations, - dg::DG, n_elements, uEltype ) + dg::DG, n_elements, uEltype) # Keep mesh state consistent, just like the hyperbolic parallel cache balance!(mesh) @@ -274,11 +274,10 @@ function create_cache_parabolic(mesh::P4estMeshParallel, update_ghost_layer!(mesh) # Local parabolic containers - elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype) + elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype) interfaces = init_interfaces(mesh, equations_hyperbolic, dg.basis, elements) boundaries = init_boundaries(mesh, equations_hyperbolic, dg.basis, elements) - # --- Minimal viscous container (ONLY what is already used downstream) viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), nnodes(dg), n_elements, diff --git a/test/test_mpi_p4est_3d.jl b/test/test_mpi_p4est_3d.jl index 77dd0af71cd..7da623d94b1 100644 --- a/test/test_mpi_p4est_3d.jl +++ b/test/test_mpi_p4est_3d.jl @@ -251,11 +251,9 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi, sol, 1000) - - @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl" begin - @test_trixi_include(joinpath(EXAMPLES_DIR, - "elixir_navierstokes_taylor_green_vortex.jl"), + @test_trixi_include(joinpath(EXAMPLES_DIR, + 
"elixir_navierstokes_taylor_green_vortex.jl"), initial_refinement_level=2, tspan=(0.0, 0.25), surface_flux=FluxHLL(min_max_speed_naive), l2=[ @@ -278,16 +276,15 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end - @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl (Diffusive CFL)" begin - @test_trixi_include(joinpath(EXAMPLES_DIR, - "elixir_navierstokes_taylor_green_vortex.jl"), + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_navierstokes_taylor_green_vortex.jl"), tspan=(0.0, 0.1), mu=0.5, # render flow diffusion-dominated callbacks=CallbackSet(summary_callback, analysis_callback, - alive_callback, - StepsizeCallback(cfl = 2.3, - cfl_diffusive = 0.4)), + alive_callback, + StepsizeCallback(cfl = 2.3, + cfl_diffusive = 0.4)), adaptive=false, # respect CFL ode_alg=CKLLSRK95_4S(), l2=[ @@ -309,11 +306,7 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") @test_allocations(Trixi.rhs!, semi, sol, 1000) @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end - - - end # 3D Testcases - end # P4estMesh MPI end # module diff --git a/test/test_parabolic_3d.jl b/test/test_parabolic_3d.jl index f7955f7cff8..492981f3561 100644 --- a/test/test_parabolic_3d.jl +++ b/test/test_parabolic_3d.jl @@ -440,7 +440,7 @@ end end @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex_amr_mortar.jl" begin - @test_trixi_include(joinpath(EXAMPLES_DIR,"p4est_3d_dgsem", + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", "elixir_navierstokes_taylor_green_vortex_amr_mortar.jl"), tspan=(0.0, 5.0), l2=[ @@ -460,7 +460,7 @@ end @test isapprox(last(summary_callback.analysis_integrals[:enstrophy]), 1.6598233348361535; - rtol=1.0e-12, atol=1.0e-14) + rtol = 1.0e-12, atol = 1.0e-14) @test_allocations(Trixi.rhs!, semi, sol, 1000) @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) From 891b71a96846478cf1ac09f5dcce8c76c73b480f Mon Sep 17 00:00:00 2001 From: 
TJP-Karpowski Date: Mon, 23 Mar 2026 22:53:10 +0100 Subject: [PATCH 7/7] corrected unnecessary change in semidiscretize hyp-para --- .../semidiscretization_hyperbolic_parabolic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl index b444c0d7cd7..3287f8d602d 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl @@ -367,7 +367,7 @@ function semidiscretize(semi::SemidiscretizationHyperbolicParabolic, tspan, # We could also construct an `ODEFunction` explicitly without the Jacobian here, # but we stick to the lean direct in-place function `rhs_parabolic!` and # let OrdinaryDiffEq.jl handle the rest - return SplitODEProblem{iip}(x, rhs!, u0_ode, tspan, semi) + return SplitODEProblem{iip}(rhs_parabolic!, rhs!, u0_ode, tspan, semi) end end