diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl b/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl
new file mode 100644
index 00000000000..0196a772d9b
--- /dev/null
+++ b/examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl
@@ -0,0 +1,129 @@
+using OrdinaryDiffEqLowStorageRK
+using Trixi
+
+# Why: This test case adds uneven refinement to the Taylor-Green vortex (TGV) to specifically test (MPI) mortar treatment in the parabolic part.
+
+###############################################################################
+# Physics
+
+prandtl_number() = 0.72
+mu = 6.25e-4 # Re ≈ 1600
+
+equations = CompressibleEulerEquations3D(1.4)
+equations_parabolic = CompressibleNavierStokesDiffusion3D(equations, mu = mu,
+                                                          Prandtl = prandtl_number())
+
+###############################################################################
+# Initial condition (Taylor-Green vortex)
+
+function initial_condition_taylor_green_vortex(x, t,
+                                               equations::CompressibleEulerEquations3D)
+    A = 1.0
+    Ms = 0.1
+
+    rho = 1.0
+    v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3])
+    v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3])
+    v3 = 0.0
+
+    p = (A / Ms)^2 * rho / equations.gamma
+    p += 1.0 / 16.0 * A^2 * rho *
+         (cos(2x[1]) * cos(2x[3]) +
+          2cos(2x[2]) + 2cos(2x[1]) +
+          cos(2x[2]) * cos(2x[3]))
+
+    return prim2cons(SVector(rho, v1, v2, v3, p), equations)
+end
+
+initial_condition = initial_condition_taylor_green_vortex
+
+###############################################################################
+# DG setup
+
+volume_flux = flux_ranocha
+
+solver = DGSEM(polydeg = 3,
+               surface_flux = flux_lax_friedrichs,
+               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
+
+###############################################################################
+# Mesh
+
+coordinates_min = (-1.0, -1.0, -1.0) .* pi
+coordinates_max = (1.0, 1.0, 1.0) .* pi
+
+trees_per_dimension = (2, 2, 2)
+
+mesh = P4estMesh(trees_per_dimension,
+                 polydeg = 3,
+                 coordinates_min = coordinates_min,
+                 coordinates_max = coordinates_max,
+                 periodicity = (true, true, true),
+                 initial_refinement_level = 0)
+
+###############################################################################
+# Semidiscretization
+
+semi = SemidiscretizationHyperbolicParabolic(mesh,
+                                             (equations, equations_parabolic),
+                                             initial_condition,
+                                             solver;
+                                             boundary_conditions = (boundary_condition_periodic,
+                                                                    boundary_condition_periodic))
+
+###############################################################################
+# AMR: positional rule → creates two refinement blocks on the diagonal of the cube.
+# Used to explicitly include mortars in the parabolic MPI test cases.
+
+function positional_rule(x, t)
+    if ((x[1] < 0) && (x[2] < 0) && (x[3] < 0)) ||
+       ((x[1] > 0) && (x[2] > 0) && (x[3] > 0))
+        return 1.0 # refine
+    else
+        return 0.0 # keep coarse
+    end
+end
+
+amr_indicator = IndicatorPositional(positional_rule, semi)
+
+amr_controller = ControllerThreeLevel(semi, amr_indicator;
+                                      base_level = 0,
+                                      med_level = 1, med_threshold = 0.5,
+                                      max_level = 2, max_threshold = 0.9)
+
+# refine only once to get a static mortar mesh
+amr_callback = AMRCallback(semi,
+                           amr_controller;
+                           interval = typemax(Int), # no further AMR
+                           adapt_initial_condition = true,
+                           adapt_initial_condition_only_refine = true)
+
+###############################################################################
+# Callbacks
+analysis_interval = 50
+analysis_callback = AnalysisCallback(semi,
+                                     interval = analysis_interval,
+                                     save_analysis = true,
+                                     extra_analysis_integrals = (energy_kinetic,
+                                                                 energy_internal,
+                                                                 enstrophy))
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
+
+callbacks = CallbackSet(analysis_callback,
+                        alive_callback,
+                        amr_callback)
+
+###############################################################################
+# Time integration
+
+tspan = (0.0, 5.0)
+
+ode = semidiscretize(semi, tspan)
+
+sol = solve(ode,
+            RDPK3SpFSAL49();
+            abstol = 1e-8,
+            reltol = 1e-8,
+            ode_default_options()...,
+            callback = callbacks)
diff --git a/src/Trixi.jl b/src/Trixi.jl
index 167e01189c8..cf8c05b2e72 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -321,7 +321,7 @@ export load_mesh, load_time, load_timestep, load_timestep!, load_dt,
        load_adaptive_time_integrator!
 
 export ControllerThreeLevel, ControllerThreeLevelCombined,
-       IndicatorLöhner, IndicatorLoehner, IndicatorMax
+       IndicatorLöhner, IndicatorLoehner, IndicatorMax, IndicatorPositional
 
 export PositivityPreservingLimiterZhangShu, EntropyBoundedLimiter
diff --git a/src/callbacks_step/amr.jl b/src/callbacks_step/amr.jl
index 5d28cdacf10..fdc9e5a0eb0 100644
--- a/src/callbacks_step/amr.jl
+++ b/src/callbacks_step/amr.jl
@@ -633,6 +633,46 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::P4estMesh,
         partition!(mesh)
         rebalance_solver!(u_ode, mesh, equations, dg, cache,
                           old_global_first_quadrant)
+
+        # Resync parabolic cache after repartitioning
+        @unpack viscous_container = cache_parabolic
+        resize!(viscous_container, equations, dg, cache)
+
+        if hasproperty(cache_parabolic, :elements)
+            nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1)
+            resize!(cache_parabolic.elements, nelements)
+            copyto!(cache_parabolic.elements.node_coordinates,
+                    cache.elements.node_coordinates)
+            copyto!(cache_parabolic.elements.jacobian_matrix,
+                    cache.elements.jacobian_matrix)
+            copyto!(cache_parabolic.elements.contravariant_vectors,
+                    cache.elements.contravariant_vectors)
+            copyto!(cache_parabolic.elements.inverse_jacobian,
+                    cache.elements.inverse_jacobian)
+            copyto!(cache_parabolic.elements.surface_flux_values,
+                    cache.elements.surface_flux_values)
+        end
+
+        if hasproperty(cache_parabolic, :interfaces)
+            ninterfaces = size(cache.interfaces.neighbor_ids, 2)
+            resize!(cache_parabolic.interfaces, ninterfaces)
+            copyto!(cache_parabolic.interfaces.u, cache.interfaces.u)
+            copyto!(cache_parabolic.interfaces.neighbor_ids,
+                    cache.interfaces.neighbor_ids)
+            copyto!(cache_parabolic.interfaces.node_indices,
+                    cache.interfaces.node_indices)
+        end
+
+        if hasproperty(cache_parabolic, :boundaries)
+            nboundaries = length(cache.boundaries.neighbor_ids)
+            resize!(cache_parabolic.boundaries, nboundaries)
+            copyto!(cache_parabolic.boundaries.u, cache.boundaries.u)
+            copyto!(cache_parabolic.boundaries.neighbor_ids,
+                    cache.boundaries.neighbor_ids)
+            copyto!(cache_parabolic.boundaries.node_indices,
+                    cache.boundaries.node_indices)
+            copyto!(cache_parabolic.boundaries.name, cache.boundaries.name)
+        end
     end
 end
diff --git a/src/callbacks_step/amr_dg2d.jl b/src/callbacks_step/amr_dg2d.jl
index 2141c2713bc..6795a752063 100644
--- a/src/callbacks_step/amr_dg2d.jl
+++ b/src/callbacks_step/amr_dg2d.jl
@@ -193,6 +193,37 @@ function refine!(u_ode::AbstractVector, adaptor,
     @unpack viscous_container = cache_parabolic
     resize!(viscous_container, equations, dg, cache)
 
+    # Keep parabolic topology containers in sync if they exist
+    # (currently only created for P4estMesh, hence the conditional)
+    if hasproperty(cache_parabolic, :elements)
+        nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1)
+        resize!(cache_parabolic.elements, nelements)
+
+        copyto!(cache_parabolic.elements.node_coordinates,
+                cache.elements.node_coordinates)
+        copyto!(cache_parabolic.elements.jacobian_matrix,
+                cache.elements.jacobian_matrix)
+        copyto!(cache_parabolic.elements.contravariant_vectors,
+                cache.elements.contravariant_vectors)
+        copyto!(cache_parabolic.elements.inverse_jacobian,
+                cache.elements.inverse_jacobian)
+        copyto!(cache_parabolic.elements.surface_flux_values,
+                cache.elements.surface_flux_values)
+    end
+    if hasproperty(cache_parabolic, :interfaces)
+        resize!(cache_parabolic.interfaces, size(cache.interfaces.neighbor_ids, 2))
+        copyto!(cache_parabolic.interfaces.u, cache.interfaces.u)
+        copyto!(cache_parabolic.interfaces.neighbor_ids, cache.interfaces.neighbor_ids)
+        copyto!(cache_parabolic.interfaces.node_indices, cache.interfaces.node_indices)
+    end
+
+    if hasproperty(cache_parabolic, :boundaries)
+        resize!(cache_parabolic.boundaries, length(cache.boundaries.neighbor_ids))
+        copyto!(cache_parabolic.boundaries.u, cache.boundaries.u)
+        copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids)
+        copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices)
+        copyto!(cache_parabolic.boundaries.name, cache.boundaries.name)
+    end
+
     return nothing
 end
@@ -387,6 +418,36 @@ function coarsen!(u_ode::AbstractVector, adaptor,
     # Resize parabolic helper variables
     @unpack viscous_container = cache_parabolic
     resize!(viscous_container, equations, dg, cache)
+    # Keep parabolic topology containers in sync if they exist
+    # (currently only created for P4estMesh, hence the conditional)
+    if hasproperty(cache_parabolic, :elements)
+        nelements = size(cache.elements.inverse_jacobian, ndims(mesh) + 1)
+        resize!(cache_parabolic.elements, nelements)
+
+        copyto!(cache_parabolic.elements.node_coordinates,
+                cache.elements.node_coordinates)
+        copyto!(cache_parabolic.elements.jacobian_matrix,
+                cache.elements.jacobian_matrix)
+        copyto!(cache_parabolic.elements.contravariant_vectors,
+                cache.elements.contravariant_vectors)
+        copyto!(cache_parabolic.elements.inverse_jacobian,
+                cache.elements.inverse_jacobian)
+        copyto!(cache_parabolic.elements.surface_flux_values,
+                cache.elements.surface_flux_values)
+    end
+    if hasproperty(cache_parabolic, :interfaces)
+        resize!(cache_parabolic.interfaces, size(cache.interfaces.neighbor_ids, 2))
+        copyto!(cache_parabolic.interfaces.u, cache.interfaces.u)
+        copyto!(cache_parabolic.interfaces.neighbor_ids, cache.interfaces.neighbor_ids)
+        copyto!(cache_parabolic.interfaces.node_indices, cache.interfaces.node_indices)
+    end
+
+    if hasproperty(cache_parabolic, :boundaries)
+        resize!(cache_parabolic.boundaries, length(cache.boundaries.neighbor_ids))
+        copyto!(cache_parabolic.boundaries.u, cache.boundaries.u)
+        copyto!(cache_parabolic.boundaries.neighbor_ids, cache.boundaries.neighbor_ids)
+        copyto!(cache_parabolic.boundaries.node_indices, cache.boundaries.node_indices)
+        copyto!(cache_parabolic.boundaries.name, cache.boundaries.name)
+    end
+
     return nothing
 end
diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl
index c730439017c..3287f8d602d 100644
--- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl
+++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl
@@ -91,6 +91,7 @@ function SemidiscretizationHyperbolicParabolic(mesh, equations::Tuple,
     cache_parabolic = create_cache_parabolic(mesh, equations, solver,
                                              nelements(solver, cache), uEltype)
+    # Merge the hyperbolic cache into the parabolic one so that, e.g., the MPI and
+    # mortar containers are also available to the parabolic kernels
+    cache_parabolic = (; cache..., cache_parabolic...)
 
     _boundary_conditions_parabolic = digest_boundary_conditions(boundary_conditions_parabolic,
                                                                 mesh, solver, cache)
@@ -173,6 +174,12 @@ function Base.show(io::IO, semi::SemidiscretizationHyperbolicParabolic)
         print(io, key)
     end
     print(io, "))")
+    print(io, ", cache_parabolic(")
+    for (idx, key) in enumerate(keys(semi.cache_parabolic))
+        idx > 1 && print(io, " ")
+        print(io, key)
+    end
+    print(io, ")")
 
     return nothing
 end
diff --git a/src/solvers/dgsem/indicators.jl b/src/solvers/dgsem/indicators.jl
index db1349f854d..43c73bdc3f2 100644
--- a/src/solvers/dgsem/indicators.jl
+++ b/src/solvers/dgsem/indicators.jl
@@ -489,4 +489,44 @@ function Base.show(io::IO, ::MIME"text/plain",
     end
     return nothing
 end
+
+"""
+    IndicatorPositional(rule, semi)
+
+Create an AMR indicator from a positional rule `rule(x, t)`. The use case for this
+indicator is explicit testing and/or targeted "static" refinement based on a
+functional rule. The function `rule` is called with the element center coordinates
+`x` and the time `t` and should return an indicator value (e.g., `0.0` or `1.0`)
+that is compared against the thresholds of the AMR controller. Note that the
+current implementation assumes a three-dimensional mesh. See
+`examples/p4est_3d_dgsem/elixir_navierstokes_taylor_green_vortex_amr_mortar.jl`
+for a usage example.
+"""
+struct IndicatorPositional{F, Cache} <: AbstractIndicator
+    rule::F
+    cache::Cache
+end
+
+# This method is used when the indicator is constructed for AMR
+function IndicatorPositional(rule, semi::AbstractSemidiscretization)
+    cache = create_cache(IndicatorPositional, semi)
+    return IndicatorPositional{typeof(rule), typeof(cache)}(rule, cache)
+end
+
+function (positional::IndicatorPositional)(u, mesh, equations, dg, cache; t = 0.0,
+                                           iter = 0)
+    x = cache.elements.node_coordinates
+    @unpack alpha, center_threaded = positional.cache
+    resize!(alpha, nelements(dg, cache))
+    # NOTE: `@threaded` does not work with the captured rule (`positional.rule`),
+    # thus `Threads.@threads` for now
+    Threads.@threads for element in eachelement(dg, cache)
+        center = center_threaded[Threads.threadid()]
+        fill!(center, 0.0)
+        n = 0
+
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            center[1] += x[1, i, j, k, element]
+            center[2] += x[2, i, j, k, element]
+            center[3] += x[3, i, j, k, element]
+            n += 1
+        end
+        center ./= n
+        alpha[element] = positional.rule(center, t)
+    end
+    return alpha
+end
 end # @muladd
diff --git a/src/solvers/dgsem_p4est/dg.jl b/src/solvers/dgsem_p4est/dg.jl
index 12e13f0d239..a74f2a9d4d6 100644
--- a/src/solvers/dgsem_p4est/dg.jl
+++ b/src/solvers/dgsem_p4est/dg.jl
@@ -68,6 +68,52 @@ function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG,
     return cache
 end
 
+function create_cache_parabolic(mesh::P4estMesh,
+                                equations_hyperbolic::AbstractEquations,
+                                dg::DG, n_elements, uEltype)
+    # Keep consistent with the baseline p4est cache
+    balance!(mesh)
+
+    elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype)
+    interfaces = init_interfaces(mesh, equations_hyperbolic, dg.basis, elements)
+    boundaries = init_boundaries(mesh, equations_hyperbolic, dg.basis, elements)
+    # NOTE: only the 3D viscous container is created here
+    viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic),
+                                                  nnodes(dg), n_elements,
+                                                  uEltype)
+
+    return (; elements, interfaces, boundaries, viscous_container)
+end
+
+function create_cache_parabolic(mesh::P4estMeshView,
+                                equations_hyperbolic::AbstractEquations,
+                                dg::DG, n_elements, uEltype)
+    balance!(mesh.parent)
+
+    elements_parent = init_elements(mesh.parent, equations_hyperbolic, dg.basis,
+                                    uEltype)
+    interfaces_parent = init_interfaces(mesh.parent, equations_hyperbolic, dg.basis,
+                                        elements_parent)
+    boundaries_parent = 
init_boundaries(mesh.parent, equations_hyperbolic, dg.basis, + elements_parent) + mortars_parent = init_mortars(mesh.parent, equations_hyperbolic, dg.basis, + elements_parent) + + elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, + interfaces_parent, + boundaries_parent, + mortars_parent, + mesh, + equations_hyperbolic, + dg, + uEltype) + + viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) + + return (; elements, interfaces, boundaries, neighbor_ids_parent, viscous_container) +end + # Extract outward-pointing normal direction # (contravariant vector ±Ja^i, i = index) # Note that this vector is not normalized @@ -91,6 +137,7 @@ include("dg_2d_parabolic.jl") include("dg_3d.jl") include("dg_3d_parabolic.jl") +include("dg_3d_parabolic_parallel.jl") include("dg_parallel.jl") # Subcell limiters diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 35372eecea7..50ff5893843 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -36,20 +36,20 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Convert conservative variables to a form more suitable for viscous flux calculations @trixi_timeit timer() "transform variables" begin transform_variables!(u_transformed, u, mesh, equations_parabolic, - dg, cache) + dg, cache_parabolic) end # Compute the gradients of the transformed variables @trixi_timeit timer() "calculate gradient" begin calc_gradient!(gradients, u_transformed, t, mesh, equations_parabolic, boundary_conditions_parabolic, - dg, parabolic_scheme, cache) + dg, parabolic_scheme, cache_parabolic) end # Compute and store the viscous fluxes @trixi_timeit timer() "calculate viscous fluxes" begin calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, - equations_parabolic, dg, cache) + equations_parabolic, dg, cache_parabolic) end # The remainder of this function is essentially a regular rhs! for parabolic @@ -65,41 +65,44 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # need to interpolate solutions *and* gradients to the surfaces. # Reset du - @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) + @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache_parabolic) # Calculate volume integral # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache) + calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, + cache_parabolic) end # Prolong solution to interfaces. # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache_parabolic, flux_viscous, mesh, equations_parabolic, + dg) end # Calculate interface fluxes # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. 
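+    # Note: since `cache_parabolic` is merged with the hyperbolic `cache` in
+    # `SemidiscretizationHyperbolicParabolic`, the parabolic kernels below can be
+    # pointed at the dedicated parabolic element/interface/boundary containers,
+    # which the AMR callbacks keep in sync with the hyperbolic ones.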
@trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, - equations_parabolic, dg, parabolic_scheme, cache) + calc_interface_flux!(cache_parabolic.elements.surface_flux_values, mesh, + equations_parabolic, dg, parabolic_scheme, cache_parabolic) end # Prolong viscous fluxes to boundaries. # This calls the specialized version for the viscous fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache_parabolic, flux_viscous, mesh, equations_parabolic, + dg) end # Calculate boundary fluxes. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux_divergence!(cache, t, + calc_boundary_flux_divergence!(cache_parabolic, t, boundary_conditions_parabolic, mesh, equations_parabolic, dg.surface_integral, dg) @@ -108,33 +111,34 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Prolong solution to mortars. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars_divergence!(cache, flux_viscous, mesh, equations_parabolic, + prolong2mortars_divergence!(cache_parabolic, flux_viscous, mesh, + equations_parabolic, dg.mortar, dg) end # Calculate mortar fluxes. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "mortar flux" begin - calc_mortar_flux_divergence!(cache.elements.surface_flux_values, + calc_mortar_flux_divergence!(cache_parabolic.elements.surface_flux_values, mesh, equations_parabolic, dg.mortar, - dg, parabolic_scheme, cache) + dg, parabolic_scheme, cache_parabolic) end # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. @trixi_timeit timer() "surface integral" begin calc_surface_integral!(du, u, mesh, equations_parabolic, - dg.surface_integral, dg, cache) + dg.surface_integral, dg, cache_parabolic) end # Apply Jacobian from mapping to reference element @trixi_timeit timer() "Jacobian" begin - apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache) + apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache_parabolic) end @trixi_timeit timer() "source terms parabolic" begin calc_sources_parabolic!(du, u, gradients, t, source_terms_parabolic, - equations_parabolic, dg, cache) + equations_parabolic, dg, cache_parabolic) end return nothing diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl new file mode 100644 index 00000000000..0b57ccc560f --- /dev/null +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic_parallel.jl @@ -0,0 +1,2031 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! 
format: noindent + +function rhs_parabolic!(du, u, t, + mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, + equations_parabolic::AbstractEquationsParabolic, + boundary_conditions_parabolic, source_terms_parabolic, + dg::DG, parabolic_scheme, cache, cache_parabolic) + @unpack viscous_container = cache_parabolic + @unpack u_transformed, gradients, flux_viscous = viscous_container + + cache_p = (; cache..., + elements = cache_parabolic.elements, + interfaces = cache_parabolic.interfaces, + boundaries = cache_parabolic.boundaries) + + # Stage 0: local variable transform + # + @trixi_timeit timer() "transform variables" begin + transform_variables!(u_transformed, u, mesh, equations_parabolic, + dg, cache_p) + end + + # + # Stage 1: gradient computation + # + + # Start gradient-stage MPI receive + @trixi_timeit timer() "start MPI receive gradient" begin + start_mpi_receive!(cache_p.mpi_cache) + end + + # Prolong transformed variables to MPI mortars + @trixi_timeit timer() "prolong2mpimortars gradient" begin + prolong2mpimortars!(cache_p, u_transformed, mesh, equations_parabolic, + dg.mortar, dg) + end + + # Prolong transformed variables to MPI interfaces + @trixi_timeit timer() "prolong2mpiinterfaces gradient" begin + prolong2mpiinterfaces!(cache_p, u_transformed, mesh, equations_parabolic, + dg.surface_integral, dg) + end + + # Start gradient-stage MPI send + @trixi_timeit timer() "start MPI send gradient" begin + start_mpi_send!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) + end + + # Local gradient computation + @trixi_timeit timer() "calculate gradient local" begin + calc_gradient!(gradients, u_transformed, t, mesh, + equations_parabolic, boundary_conditions_parabolic, + dg, parabolic_scheme, cache_p) + end + + # Finish gradient-stage MPI receive + @trixi_timeit timer() "finish MPI receive gradient" begin + finish_mpi_receive!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p) + end + # Finish gradient-stage MPI send + @trixi_timeit timer() "finish MPI send gradient" begin + finish_mpi_send!(cache_p.mpi_cache) + end + # MPI interface fluxes for gradient stage + @trixi_timeit timer() "MPI interface flux gradient" begin + calc_mpi_interface_flux_gradient!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, + dg, parabolic_scheme, cache_p) + end + + # MPI mortar fluxes for gradient stage + @trixi_timeit timer() "MPI mortar flux gradient" begin + calc_mpi_mortar_flux_gradient!(cache_p.elements.surface_flux_values, + mesh, equations_parabolic, dg.mortar, + dg, parabolic_scheme, cache_p) + end + + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral_gradient!(gradients, mesh, equations_parabolic, + dg, cache_p) + end + + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" begin + apply_jacobian_parabolic!(gradients, mesh, equations_parabolic, dg, + cache_p) + end + # Stage 2: local viscous flux construction + # + @trixi_timeit timer() "calculate viscous fluxes" begin + calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, + equations_parabolic, dg, cache_p) + end + + # + # Stage 3: divergence of viscous fluxes + # + + # Start divergence-stage MPI receive + @trixi_timeit timer() "start MPI receive divergence" begin + start_mpi_receive!(cache_p.mpi_cache) + end + + # Reset du + @trixi_timeit timer() "reset ∂u/∂t" begin + set_zero!(du, dg, cache_p) + end + + # Local volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral!(du, 
flux_viscous, mesh, equations_parabolic, dg, cache_p)
+    end
+
+    # Prolong viscous fluxes to MPI mortars
+    @trixi_timeit timer() "prolong2mpimortars divergence" begin
+        prolong2mpimortars_divergence!(cache_p, flux_viscous, mesh, equations_parabolic,
+                                       dg.mortar, dg)
+    end
+
+    # Prolong viscous fluxes to MPI interfaces
+    @trixi_timeit timer() "prolong2mpiinterfaces divergence" begin
+        prolong2mpiinterfaces!(cache_p, flux_viscous, mesh, equations_parabolic, dg)
+    end
+
+    # Start divergence-stage MPI send
+    @trixi_timeit timer() "start MPI send divergence" begin
+        start_mpi_send!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p)
+    end
+
+    # Local interface fluxes
+    @trixi_timeit timer() "prolong2interfaces" begin
+        prolong2interfaces!(cache_p, flux_viscous, mesh, equations_parabolic, dg)
+    end
+
+    @trixi_timeit timer() "interface flux" begin
+        calc_interface_flux!(cache_p.elements.surface_flux_values, mesh,
+                             equations_parabolic, dg, parabolic_scheme, cache_p)
+    end
+
+    # Local boundary fluxes
+    @trixi_timeit timer() "prolong2boundaries" begin
+        prolong2boundaries!(cache_p, flux_viscous, mesh, equations_parabolic, dg)
+    end
+
+    @trixi_timeit timer() "boundary flux" begin
+        calc_boundary_flux_divergence!(cache_p, t,
+                                       boundary_conditions_parabolic, mesh,
+                                       equations_parabolic,
+                                       dg.surface_integral, dg)
+    end
+
+    # Local mortar fluxes
+    @trixi_timeit timer() "prolong2mortars" begin
+        prolong2mortars_divergence!(cache_p, flux_viscous, mesh, equations_parabolic,
+                                    dg.mortar, dg)
+    end
+
+    @trixi_timeit timer() "mortar flux" begin
+        calc_mortar_flux_divergence!(cache_p.elements.surface_flux_values,
+                                     mesh, equations_parabolic, dg.mortar,
+                                     dg, parabolic_scheme, cache_p)
+    end
+
+    # Finish divergence-stage MPI receive
+    @trixi_timeit timer() "finish MPI receive divergence" begin
+        finish_mpi_receive!(cache_p.mpi_cache, mesh, equations_parabolic, dg, cache_p)
+    end
+
+    # MPI interface fluxes for divergence stage
+    @trixi_timeit timer() "MPI interface flux divergence" begin
+        calc_mpi_interface_flux_divergence!(cache_p.elements.surface_flux_values,
+                                            mesh, equations_parabolic,
+                                            dg, parabolic_scheme, cache_p)
+    end
+
+    # MPI mortar fluxes for divergence stage
+    @trixi_timeit timer() "MPI mortar flux divergence" begin
+        calc_mpi_mortar_flux_divergence!(cache_p.elements.surface_flux_values,
+                                         mesh, equations_parabolic, dg.mortar,
+                                         dg, parabolic_scheme, cache_p)
+    end
+
+    # Finish divergence-stage MPI send
+    @trixi_timeit timer() "finish MPI send divergence" begin
+        finish_mpi_send!(cache_p.mpi_cache)
+    end
+
+    #
+    # Stage 4: final assembly
+    #
+    @trixi_timeit timer() "surface integral" begin
+        calc_surface_integral!(du, u, mesh, equations_parabolic,
+                               dg.surface_integral, dg, cache_p)
+    end
+
+    @trixi_timeit timer() "Jacobian" begin
+        apply_jacobian_parabolic!(du, mesh, equations_parabolic, dg, cache_p)
+    end
+
+    @trixi_timeit timer() "source terms parabolic" begin
+        calc_sources_parabolic!(du, u, gradients, t, source_terms_parabolic,
+                                equations_parabolic, dg, cache_p)
+    end
+
+    return nothing
+end
+
+function calc_gradient!(gradients, u_transformed, t,
+                        mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                        equations_parabolic, boundary_conditions_parabolic,
+                        dg::DG, parabolic_scheme, cache)
+    # Reset gradients
+    @trixi_timeit timer() "reset gradients" begin
+        reset_gradients!(gradients, dg, cache)
+    end
+
+    # Calculate volume integral
+    @trixi_timeit timer() "volume integral" begin
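+        # This applies the reference-space derivative operator to `u_transformed`,
+        # weighted by the contravariant vectors, analogous to the serial gradient
+        # kernel for `P4estMesh{3}` in `dg_3d_parabolic.jl`.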
        calc_volume_integral_gradient!(gradients, u_transformed,
+                                       mesh, equations_parabolic, dg, cache)
+    end
+
+    # Prolong solution to interfaces.
+    # This reuses `prolong2interfaces` for the purely hyperbolic case.
+    @trixi_timeit timer() "prolong2interfaces" begin
+        prolong2interfaces!(cache, u_transformed, mesh,
+                            equations_parabolic, dg)
+    end
+
+    # Calculate interface fluxes for the gradients
+    @trixi_timeit timer() "interface flux" begin
+        @unpack surface_flux_values = cache.elements
+        calc_interface_flux_gradient!(surface_flux_values, mesh, equations_parabolic,
+                                      dg, parabolic_scheme, cache)
+    end
+
+    # Prolong solution to boundaries.
+    # This reuses `prolong2boundaries` for the purely hyperbolic case.
+    @trixi_timeit timer() "prolong2boundaries" begin
+        prolong2boundaries!(cache, u_transformed, mesh,
+                            equations_parabolic, dg)
+    end
+
+    # Calculate boundary fluxes
+    @trixi_timeit timer() "boundary flux" begin
+        calc_boundary_flux_gradient!(cache, t, boundary_conditions_parabolic,
+                                     mesh, equations_parabolic, dg.surface_integral,
+                                     dg)
+    end
+
+    # Prolong solution to mortars.
+    # This reuses `prolong2mortars` for the purely hyperbolic case.
+    @trixi_timeit timer() "prolong2mortars" begin
+        prolong2mortars!(cache, u_transformed, mesh, equations_parabolic,
+                         dg.mortar, dg)
+    end
+
+    # Calculate mortar fluxes
+    @trixi_timeit timer() "mortar flux" begin
+        calc_mortar_flux_gradient!(cache.elements.surface_flux_values,
+                                   mesh, equations_parabolic, dg.mortar,
+                                   dg, parabolic_scheme, cache)
+    end
+
+    return nothing
+end
+
+# This version is called during `calc_gradient!` and must be specialized because the
+# MAGNITUDE of the flux does NOT depend on the outward normal.
+# Thus, you don't need to scale by 2 (e.g., the scaling factor in the normals (and in
+# the contravariant vectors) along large/small elements across a non-conforming
+# interface) and flip the sign when storing the mortar fluxes back
+# into `surface_flux_values`.
+@inline function mortar_fluxes_to_elements_gradient!(surface_flux_values,
+                                                     mesh::Union{P4estMeshParallel{3},
+                                                                 T8codeMeshParallel{3}},
+                                                     equations_parabolic::AbstractEquationsParabolic,
+                                                     mortar_l2::LobattoLegendreMortarL2,
+                                                     dg::DGSEM, cache, mortar,
+                                                     fstar_primary,
+                                                     fstar_secondary, u_buffer,
+                                                     fstar_tmp)
+    @unpack neighbor_ids, node_indices = cache.mortars
+    index_range = eachnode(dg)
+    # Copy flux small to small
+    small_indices = node_indices[1, mortar]
+    small_direction = indices2direction(small_indices)
+
+    for position in 1:4 # Loop over small elements
+        element = neighbor_ids[position, mortar]
+        for j in eachnode(dg), i in eachnode(dg)
+            for v in eachvariable(equations_parabolic)
+                surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v,
+                                                                                       i,
+                                                                                       j,
+                                                                                       position]
+            end
+        end
+    end
+
+    # Project small fluxes to large element.
+    multiply_dimensionwise!(u_buffer,
+                            mortar_l2.reverse_lower, mortar_l2.reverse_lower,
+                            view(fstar_secondary, .., 1), fstar_tmp)
+    add_multiply_dimensionwise!(u_buffer,
+                                mortar_l2.reverse_upper, mortar_l2.reverse_lower,
+                                view(fstar_secondary, .., 2), fstar_tmp)
+    add_multiply_dimensionwise!(u_buffer,
+                                mortar_l2.reverse_lower, mortar_l2.reverse_upper,
+                                view(fstar_secondary, .., 3), fstar_tmp)
+    add_multiply_dimensionwise!(u_buffer,
+                                mortar_l2.reverse_upper, mortar_l2.reverse_upper,
+                                view(fstar_secondary, .., 4), fstar_tmp)
+
+    # Copy interpolated flux values from buffer to large element face in the
+    # correct orientation.
+    # Note that the index of the small sides will always run forward but
+    # the index of the large side might need to run backwards for flipped sides.
+    large_element = neighbor_ids[5, mortar]
+    large_indices = node_indices[2, mortar]
+    large_direction = indices2direction(large_indices)
+    large_surface_indices = surface_indices(large_indices)
+
+    i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_surface_indices[1],
+                                                                           index_range)
+    j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_surface_indices[2],
+                                                                           index_range)
+
+    i_large = i_large_start
+    j_large = j_large_start
+    for j in eachnode(dg)
+        for i in eachnode(dg)
+            for v in eachvariable(equations_parabolic)
+                surface_flux_values[v, i_large, j_large, large_direction, large_element] = u_buffer[v,
+                                                                                                    i,
+                                                                                                    j]
+            end
+            i_large += i_large_step_i
+            j_large += j_large_step_i
+        end
+        i_large += i_large_step_j
+        j_large += j_large_step_j
+    end
+
+    return nothing
+end
+
+function calc_interface_flux_gradient!(surface_flux_values,
+                                       mesh::Union{P4estMeshParallel{3},
+                                                   T8codeMeshParallel{3}},
+                                       equations_parabolic,
+                                       dg::DG, parabolic_scheme, cache)
+    @unpack neighbor_ids, node_indices = cache.interfaces
+    @unpack contravariant_vectors = cache.elements
+    index_range = eachnode(dg)
+
+    @threaded for interface in eachinterface(dg, cache)
+        # Get element and side information on the primary element
+        primary_element = neighbor_ids[1, interface]
+        primary_indices = node_indices[1, interface]
+        primary_direction = indices2direction(primary_indices)
+
+        i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1],
+                                                                                     index_range)
+        j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2],
+                                                                                     index_range)
+        k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3],
+                                                                                     index_range)
+
+        i_primary = i_primary_start
+        j_primary = j_primary_start
+        k_primary = k_primary_start
+
+        # Get element and side information on the secondary element
+        secondary_element = neighbor_ids[2, interface]
+        secondary_indices = node_indices[2, interface]
+        secondary_direction = indices2direction(secondary_indices)
+        secondary_surface_indices = surface_indices(secondary_indices)
+
+        # Get the surface indexing on the secondary element.
+        # Note that the indices of the primary side will always run forward but
+        # the secondary indices might need to run backwards for flipped sides.
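+        # For example (following the usual index conventions), `:i_forward` yields
+        # `(start, step_i, step_j) = (firstindex, 1, 0)`, while `:i_backward` yields
+        # `(lastindex, -1, 0)`, i.e., the traversal runs backwards.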
+ i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], + index_range) + i_secondary = i_secondary_start + j_secondary = j_secondary_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, k_primary, + primary_element) + + calc_interface_flux_gradient!(surface_flux_values, mesh, + equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, i, j, + primary_direction, primary_element, + i_secondary, j_secondary, + secondary_direction, secondary_element) + + # Increment the primary element indices + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + # Increment the secondary element surface indices + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + end + end + + return nothing +end + +# This version is used for parabolic gradient computations +@inline function calc_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + interface_index, normal_direction, + primary_i_node_index, + primary_j_node_index, + primary_direction_index, + primary_element_index, + secondary_i_node_index, + secondary_j_node_index, + secondary_direction_index, + secondary_element_index) + @unpack u = cache.interfaces + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, + primary_i_node_index, primary_j_node_index, + interface_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + for v in eachvariable(equations_parabolic) + surface_flux_values[v, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index] = flux_[v] + # No sign flip required for gradient calculation because for parabolic terms, + # the normals are not embedded in `flux_` for gradient computations. 
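+        # E.g., for the BR1 scheme this flux is the arithmetic mean
+        # `0.5 * (u_ll + u_rr)`, which is symmetric in its arguments, so both sides
+        # can store the same value.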
+ surface_flux_values[v, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index] = flux_[v] + end + + return nothing +end + +function calc_mpi_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache) + @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + @threaded for interface in eachmpiinterface(dg, cache) + local_element = local_neighbor_ids[interface] + local_indices = node_indices[interface] + local_direction = indices2direction(local_indices) + local_side = local_sides[interface] + + # Create the local i,j,k indexing on the local element used to pull normal direction information + i_element_start, i_element_step_i, i_element_step_j = index_to_start_step_3d(local_indices[1], + index_range) + j_element_start, j_element_step_i, j_element_step_j = index_to_start_step_3d(local_indices[2], + index_range) + k_element_start, k_element_step_i, k_element_step_j = index_to_start_step_3d(local_indices[3], + index_range) + + i_element = i_element_start + j_element = j_element_start + k_element = k_element_start + + # Initiate the node indices to be used in the surface for loop, + # the surface flux storage must be indexed in alignment with the local element indexing + local_surface_indices = surface_indices(local_indices) + i_surface_start, i_surface_step_i, i_surface_step_j = index_to_start_step_3d(local_surface_indices[1], + index_range) + j_surface_start, j_surface_step_i, j_surface_step_j = index_to_start_step_3d(local_surface_indices[2], + index_range) + i_surface = i_surface_start + j_surface = j_surface_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(local_direction, + contravariant_vectors, + i_element, j_element, k_element, + local_element) + + calc_mpi_interface_flux_gradient!(surface_flux_values, mesh, + equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, + i, j, local_side, + i_surface, j_surface, + local_direction, local_element) + # Increment local element indices to pull the normal direction + i_element += i_element_step_i + j_element += j_element_step_i + k_element += k_element_step_i + # Increment the surface node indices along the local element + i_surface += i_surface_step_i + j_surface += j_surface_step_i + end + # Increment local element indices to pull the normal direction + i_element += i_element_step_j + j_element += j_element_step_j + k_element += k_element_step_j + # Increment the surface node indices along the local element + i_surface += i_surface_step_j + j_surface += j_surface_step_j + end + end + + return nothing +end + +@inline function calc_mpi_interface_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + interface_index, normal_direction, + interface_i_node_index, + interface_j_node_index, local_side, + surface_i_node_index, + surface_j_node_index, + local_direction_index, + local_element_index) + @unpack u = cache.mpi_interfaces + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, + interface_i_node_index, + interface_j_node_index, + interface_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + for v in eachvariable(equations_parabolic) + 
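+        # As in the serial gradient version, no sign flip is applied here; the
+        # gradient-stage flux does not depend on the normal orientation, so
+        # `local_side` does not enter the assignment below.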
+        surface_flux_values[v, surface_i_node_index, surface_j_node_index,
+                            local_direction_index, local_element_index] = flux_[v]
+    end
+
+    return nothing
+end
+
+# This is the version used when calculating the divergence of the viscous fluxes.
+# Identical to the weak-form volume kernel for the purely hyperbolic case,
+# except that the fluxes here are already precomputed in `calc_viscous_fluxes!`
+function calc_volume_integral!(du, flux_viscous,
+                               mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                               equations_parabolic::AbstractEquationsParabolic,
+                               dg::DGSEM, cache)
+    (; derivative_hat) = dg.basis
+    (; contravariant_vectors) = cache.elements
+    flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous
+
+    @threaded for element in eachelement(dg, cache)
+        # Calculate volume terms in one element
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            flux1 = get_node_vars(flux_viscous_x, equations_parabolic, dg,
+                                  i, j, k, element)
+            flux2 = get_node_vars(flux_viscous_y, equations_parabolic, dg,
+                                  i, j, k, element)
+            flux3 = get_node_vars(flux_viscous_z, equations_parabolic, dg,
+                                  i, j, k, element)
+
+            # Compute the contravariant flux by taking the scalar product of the
+            # first contravariant vector Ja^1 and the flux vector
+            Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors,
+                                                        i, j, k, element)
+            contravariant_flux1 = Ja11 * flux1 + Ja12 * flux2 + Ja13 * flux3
+            for ii in eachnode(dg)
+                multiply_add_to_node_vars!(du, derivative_hat[ii, i],
+                                           contravariant_flux1,
+                                           equations_parabolic, dg, ii, j, k, element)
+            end
+
+            # Compute the contravariant flux by taking the scalar product of the
+            # second contravariant vector Ja^2 and the flux vector
+            Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors,
+                                                        i, j, k, element)
+            contravariant_flux2 = Ja21 * flux1 + Ja22 * flux2 + Ja23 * flux3
+            for jj in eachnode(dg)
+                multiply_add_to_node_vars!(du, derivative_hat[jj, j],
+                                           contravariant_flux2,
+                                           equations_parabolic, dg, i, jj, k, element)
+            end
+
+            # Compute the contravariant flux by taking the scalar product of the
+            # third contravariant vector Ja^3 and the flux vector
+            Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors,
+                                                        i, j, k, element)
+            contravariant_flux3 = Ja31 * flux1 + Ja32 * flux2 + Ja33 * flux3
+            for kk in eachnode(dg)
+                multiply_add_to_node_vars!(du, derivative_hat[kk, k],
+                                           contravariant_flux3,
+                                           equations_parabolic, dg, i, j, kk, element)
+            end
+        end
+    end
+
+    return nothing
+end
+
+# This is the version used when calculating the divergence of the viscous fluxes.
+# The specialization `flux_viscous::Tuple` is needed to
+# avoid ambiguity with the hyperbolic version of `prolong2interfaces!` in dg_3d.jl,
+# which is for the variables themselves, i.e., `u::Array{uEltype, 5}`.
+function prolong2interfaces!(cache, flux_viscous::Tuple,
+                             mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                             equations_parabolic::AbstractEquationsParabolic,
+                             dg::DG)
+    (; interfaces) = cache
+    (; contravariant_vectors) = cache.elements
+    index_range = eachnode(dg)
+    flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous
+
+    @threaded for interface in eachinterface(dg, cache)
+        # Copy solution data from the primary element using "delayed indexing" with
+        # a start value and a step size to get the correct face and orientation.
+        # Note that in the current implementation, the interface will be
+        # "aligned at the primary element", i.e., the index of the primary side
+        # will always run forwards.
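+        # For the divergence stage, `interfaces.u` will hold the normal viscous flux
+        # `f_v ⋅ n` (one scalar per variable) rather than the solution itself; see
+        # the `OBS!` notes below.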
+ primary_element = interfaces.neighbor_ids[1, interface] + primary_indices = interfaces.node_indices[1, interface] + primary_direction = indices2direction(primary_indices) + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + + for j in eachnode(dg) + for i in eachnode(dg) + # this is the outward normal direction on the primary element + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, + k_primary, + primary_element) + + for v in eachvariable(equations_parabolic) + # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! + flux_viscous = SVector(flux_viscous_x[v, i_primary, j_primary, + k_primary, primary_element], + flux_viscous_y[v, i_primary, j_primary, + k_primary, primary_element], + flux_viscous_z[v, i_primary, j_primary, + k_primary, primary_element]) + + interfaces.u[1, v, i, j, interface] = dot(flux_viscous, + normal_direction) + end + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + end + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + end + + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. + secondary_element = interfaces.neighbor_ids[2, interface] + secondary_indices = interfaces.node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], + index_range) + k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + k_secondary = k_secondary_start + for j in eachnode(dg) + for i in eachnode(dg) + # This is the outward normal direction on the secondary element. + # Here, we assume that normal_direction on the secondary element is + # the negative of normal_direction on the primary element. + normal_direction = get_normal_direction(secondary_direction, + contravariant_vectors, + i_secondary, j_secondary, + k_secondary, + secondary_element) + + for v in eachvariable(equations_parabolic) + # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! 
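+                    # Gather the three Cartesian flux components into an SVector and
+                    # contract them with the (Jacobian-scaled) outward normal.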
+                    flux_viscous = SVector(flux_viscous_x[v, i_secondary, j_secondary,
+                                                          k_secondary,
+                                                          secondary_element],
+                                           flux_viscous_y[v, i_secondary, j_secondary,
+                                                          k_secondary,
+                                                          secondary_element],
+                                           flux_viscous_z[v, i_secondary, j_secondary,
+                                                          k_secondary,
+                                                          secondary_element])
+                    # store the normal flux with respect to the primary normal direction,
+                    # which is the negative of the secondary normal direction
+                    interfaces.u[2, v, i, j, interface] = -dot(flux_viscous,
+                                                               normal_direction)
+                end
+                i_secondary += i_secondary_step_i
+                j_secondary += j_secondary_step_i
+                k_secondary += k_secondary_step_i
+            end
+            i_secondary += i_secondary_step_j
+            j_secondary += j_secondary_step_j
+            k_secondary += k_secondary_step_j
+        end
+    end
+
+    return nothing
+end
+
+# This version is used for divergence flux computations
+function calc_interface_flux!(surface_flux_values,
+                              mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                              equations_parabolic, dg::DG, parabolic_scheme,
+                              cache)
+    (; neighbor_ids, node_indices) = cache.interfaces
+    (; contravariant_vectors) = cache.elements
+    index_range = eachnode(dg)
+
+    @threaded for interface in eachinterface(dg, cache)
+        # Get element and side index information on the primary element
+        primary_element = neighbor_ids[1, interface]
+        primary_indices = node_indices[1, interface]
+        primary_direction_index = indices2direction(primary_indices)
+
+        i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1],
+                                                                                     index_range)
+        j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2],
+                                                                                     index_range)
+        k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3],
+                                                                                     index_range)
+
+        i_primary = i_primary_start
+        j_primary = j_primary_start
+        k_primary = k_primary_start
+
+        # Get element and side index information on the secondary element
+        secondary_element = neighbor_ids[2, interface]
+        secondary_indices = node_indices[2, interface]
+        secondary_direction_index = indices2direction(secondary_indices)
+        secondary_surface_indices = surface_indices(secondary_indices)
+
+        # Initiate the secondary indices to be used in the surface for loop.
+        # Note that the indices of the primary side will always run forward but
+        # the secondary indices might need to run backwards for flipped sides.
+        i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1],
+                                                                                           index_range)
+        j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2],
+                                                                                           index_range)
+        i_secondary = i_secondary_start
+        j_secondary = j_secondary_start
+
+        for j in eachnode(dg)
+            for i in eachnode(dg)
+                normal_direction = get_normal_direction(primary_direction_index,
+                                                        contravariant_vectors,
+                                                        i_primary, j_primary, k_primary,
+                                                        primary_element)
+                # We prolong the viscous flux dotted with the outward normal on the
+                # primary element.
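+                # E.g., for the BR1 scheme the divergence interface flux is again the
+                # arithmetic mean `0.5 * (f_ll + f_rr)` of the two normal fluxes.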
+ viscous_flux_normal_ll, viscous_flux_normal_rr = get_surface_node_vars(cache.interfaces.u, + equations_parabolic, + dg, + i, + j, + interface) + + flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i, j, primary_direction_index, primary_element] = flux_[v] + # Sign flip required for divergence calculation since the flux for the + # divergence involves the normal direction. + surface_flux_values[v, i_secondary, j_secondary, secondary_direction_index, secondary_element] = -flux_[v] + end + + # Increment the primary element indices + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + # Increment the secondary element surface indices + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + end + end + + return nothing +end + +function prolong2mortars_divergence!(cache, flux_viscous, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM) + @unpack neighbor_ids, node_indices = cache.mortars + @unpack fstar_tmp_threaded = cache + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for mortar in eachmortar(dg, cache) + # Copy solution data from the small elements using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
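+        # Layout of `cache.mortars.u`: [primary/secondary, variable, position, i, j, mortar],
+        # where positions 1-4 hold the small elements; the large-element data is
+        # interpolated into the secondary slot below.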
+        small_indices = node_indices[1, mortar]
+        direction_index = indices2direction(small_indices)
+
+        i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1],
+                                                                               index_range)
+        j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2],
+                                                                               index_range)
+        k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3],
+                                                                               index_range)
+
+        for position in 1:4 # Loop over small elements
+            i_small = i_small_start
+            j_small = j_small_start
+            k_small = k_small_start
+            element = neighbor_ids[position, mortar]
+            for j in eachnode(dg)
+                for i in eachnode(dg)
+                    normal_direction = get_normal_direction(direction_index,
+                                                            contravariant_vectors,
+                                                            i_small, j_small, k_small,
+                                                            element)
+
+                    for v in eachvariable(equations_parabolic)
+                        flux_viscous = SVector(flux_viscous_x[v, i_small, j_small,
+                                                              k_small, element],
+                                               flux_viscous_y[v, i_small, j_small,
+                                                              k_small, element],
+                                               flux_viscous_z[v, i_small, j_small,
+                                                              k_small, element])
+
+                        cache.mortars.u[1, v, position, i, j, mortar] = dot(flux_viscous,
+                                                                            normal_direction)
+                    end
+                    i_small += i_small_step_i
+                    j_small += j_small_step_i
+                    k_small += k_small_step_i
+                end
+                i_small += i_small_step_j
+                j_small += j_small_step_j
+                k_small += k_small_step_j
+            end
+        end
+
+        # Buffer to copy solution values of the large element in the correct orientation
+        # before interpolating
+        u_buffer = cache.u_threaded[Threads.threadid()]
+
+        # temporary buffer for projections
+        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+
+        # Copy solution of large element face to buffer in the
+        # correct orientation
+        large_indices = node_indices[2, mortar]
+
+        i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_indices[1],
+                                                                               index_range)
+        j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_indices[2],
+                                                                               index_range)
+        k_large_start, k_large_step_i, k_large_step_j = index_to_start_step_3d(large_indices[3],
+                                                                               index_range)
+
+        i_large = i_large_start
+        j_large = j_large_start
+        k_large = k_large_start
+        element = neighbor_ids[5, mortar] # Large element
+        for j in eachnode(dg)
+            for i in eachnode(dg)
+                normal_direction = get_normal_direction(direction_index,
+                                                        contravariant_vectors,
+                                                        i_large, j_large, k_large,
+                                                        element)
+
+                for v in eachvariable(equations_parabolic)
+                    flux_viscous = SVector(flux_viscous_x[v, i_large, j_large, k_large,
+                                                          element],
+                                           flux_viscous_y[v, i_large, j_large, k_large,
+                                                          element],
+                                           flux_viscous_z[v, i_large, j_large, k_large,
+                                                          element])
+
+                    # We prolong the viscous flux dotted with the outward normal
+                    # on the small element. We scale by -1/2 here because the normal
+                    # direction on the large element is negative 2x that of the small
+                    # element (these normal directions are "scaled" by the surface
+                    # Jacobian)
+                    u_buffer[v, i, j] = -0.5f0 * dot(flux_viscous, normal_direction)
+                end
+                i_large += i_large_step_i
+                j_large += j_large_step_i
+                k_large += k_large_step_i
+            end
+            i_large += i_large_step_j
+            j_large += j_large_step_j
+            k_large += k_large_step_j
+        end
+
+        # Interpolate large element face data from buffer to small face locations
+        multiply_dimensionwise!(view(cache.mortars.u, 2, :, 1, :, :, mortar),
+                                mortar_l2.forward_lower,
+                                mortar_l2.forward_lower,
+                                u_buffer, fstar_tmp)
+        multiply_dimensionwise!(view(cache.mortars.u, 2, :, 2, :, :, mortar),
+                                mortar_l2.forward_upper,
+                                mortar_l2.forward_lower,
+                                u_buffer, fstar_tmp)
+        multiply_dimensionwise!(view(cache.mortars.u, 2, :, 3, :, :, mortar),
+                                mortar_l2.forward_lower,
+                                mortar_l2.forward_upper,
+                                u_buffer, fstar_tmp)
+        multiply_dimensionwise!(view(cache.mortars.u, 2, :, 4, :, :, mortar),
+                                mortar_l2.forward_upper,
+                                mortar_l2.forward_upper,
+                                u_buffer, fstar_tmp)
+    end
+
+    return nothing
+end
+
+function prolong2mpimortars_divergence!(cache, flux_viscous,
+                                        mesh::Union{P4estMeshParallel{3},
+                                                    T8codeMeshParallel{3}},
+                                        equations_parabolic,
+                                        mortar_l2::LobattoLegendreMortarL2,
+                                        dg::DGSEM)
+    @unpack node_indices = cache.mpi_mortars
+    @unpack contravariant_vectors = cache.elements
+    index_range = eachnode(dg)
+
+    flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous
+
+    @threaded for mortar in eachmpimortar(dg, cache)
+        local_neighbor_ids = cache.mpi_mortars.local_neighbor_ids[mortar]
+        local_neighbor_positions = cache.mpi_mortars.local_neighbor_positions[mortar]
+
+        # Small side indexing
+        small_indices = node_indices[1, mortar]
+        direction_index = indices2direction(small_indices)
+
+        i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1],
+                                                                               index_range)
+        j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2],
+                                                                               index_range)
+        k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3],
+                                                                               index_range)
+
+        # Large side indexing
+        large_indices = node_indices[2, mortar]
+
+        i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_indices[1],
+                                                                               index_range)
+        j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_indices[2],
+                                                                               index_range)
+        k_large_start, k_large_step_i, k_large_step_j = index_to_start_step_3d(large_indices[3],
+                                                                               index_range)
+
+        for (element, position) in zip(local_neighbor_ids, local_neighbor_positions)
+            if position == 5
+                # =========================
+                # LARGE ELEMENT
+                # =========================
+                u_buffer = cache.u_threaded[Threads.threadid()]
+                fstar_tmp = cache.fstar_tmp_threaded[Threads.threadid()]
+
+                i_large = i_large_start
+                j_large = j_large_start
+                k_large = k_large_start
+
+                for j in eachnode(dg)
+                    for i in eachnode(dg)
+                        normal_direction = get_normal_direction(direction_index,
+                                                                contravariant_vectors,
+                                                                i_large, j_large,
+                                                                k_large,
+                                                                element)
+
+                        for v in eachvariable(equations_parabolic)
+                            flux_node = SVector(flux_viscous_x[v, i_large, j_large,
+                                                               k_large, element],
+                                                flux_viscous_y[v, i_large, j_large,
+                                                               k_large, element],
+                                                flux_viscous_z[v, i_large, j_large,
+                                                               k_large, element])
+
+                            # same convention as the local mortar code
+                            u_buffer[v, i, j] = -0.5f0 *
+                                                dot(flux_node, normal_direction)
+                        end
+
+                        i_large += i_large_step_i
+                        j_large += j_large_step_i
+                        k_large += k_large_step_i
+                    end
+                    i_large
+= i_large_step_j + j_large += j_large_step_j + k_large += k_large_step_j + end + + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 1, :, :, + mortar), + mortar_l2.forward_lower, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 2, :, :, + mortar), + mortar_l2.forward_upper, + mortar_l2.forward_lower, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 3, :, :, + mortar), + mortar_l2.forward_lower, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + multiply_dimensionwise!(view(cache.mpi_mortars.u, 2, :, 4, :, :, + mortar), + mortar_l2.forward_upper, + mortar_l2.forward_upper, + u_buffer, fstar_tmp) + + else + # ========================= + # SMALL ELEMENT (1–4) + # ========================= + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_small, j_small, + k_small, + element) + + for v in eachvariable(equations_parabolic) + flux_node = SVector(flux_viscous_x[v, i_small, j_small, + k_small, element], + flux_viscous_y[v, i_small, j_small, + k_small, element], + flux_viscous_z[v, i_small, j_small, + k_small, element]) + + cache.mpi_mortars.u[1, v, position, i, j, mortar] = dot(flux_node, + normal_direction) + end + + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + end + end + + return nothing +end + +function calc_mortar_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DG, parabolic_scheme, cache) + @unpack neighbor_ids, node_indices = cache.mortars + @unpack contravariant_vectors = cache.elements + @unpack fstar_primary_threaded, fstar_tmp_threaded = cache + index_range = eachnode(dg) + + @threaded for mortar in eachmortar(dg, cache) + # Choose thread-specific pre-allocated container + fstar = fstar_primary_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + # Get index information on the small elements + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + + i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1], + index_range) + j_small_start, j_small_step_i, j_small_step_j = index_to_start_step_3d(small_indices[2], + index_range) + k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], + index_range) + + for position in 1:4 # Loop over small elements + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + element = neighbor_ids[position, mortar] + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(small_direction, + contravariant_vectors, + i_small, j_small, k_small, + element) + + for v in eachvariable(equations_parabolic) + viscous_flux_normal_ll = cache.mortars.u[1, v, position, i, j, + mortar] + viscous_flux_normal_rr = cache.mortars.u[2, v, position, i, j, + mortar] + + flux_ = flux_parabolic(viscous_flux_normal_ll, + viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + + # Sign flip (and scaling by 0.5) already handled above in `prolong2mortars_divergence!` + fstar[v, i, j, position] = flux_ + end + + i_small += i_small_step_i 
+ j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + + # Buffer to interpolate flux values of the large element to before + # copying in the correct orientation + u_buffer = cache.u_threaded[Threads.threadid()] + + # this reuses the hyperbolic version of `mortar_fluxes_to_elements!` + mortar_fluxes_to_elements!(surface_flux_values, mesh, + equations_parabolic, mortar_l2, dg, cache, + mortar, fstar, fstar, u_buffer, + fstar_tmp) + end + + return nothing +end + +function calc_mpi_interface_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache) + @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + @threaded for interface in eachmpiinterface(dg, cache) + local_element = local_neighbor_ids[interface] + local_indices = node_indices[interface] + local_direction = indices2direction(local_indices) + local_side = local_sides[interface] + + i_element_start, i_element_step_i, i_element_step_j = index_to_start_step_3d(local_indices[1], + index_range) + j_element_start, j_element_step_i, j_element_step_j = index_to_start_step_3d(local_indices[2], + index_range) + k_element_start, k_element_step_i, k_element_step_j = index_to_start_step_3d(local_indices[3], + index_range) + + i_element = i_element_start + j_element = j_element_start + k_element = k_element_start + + local_surface_indices = surface_indices(local_indices) + i_surface_start, i_surface_step_i, i_surface_step_j = index_to_start_step_3d(local_surface_indices[1], + index_range) + j_surface_start, j_surface_step_i, j_surface_step_j = index_to_start_step_3d(local_surface_indices[2], + index_range) + + i_surface = i_surface_start + j_surface = j_surface_start + + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(local_direction, + contravariant_vectors, + i_element, j_element, k_element, + local_element) + + calc_mpi_interface_flux_divergence!(surface_flux_values, mesh, + equations_parabolic, + dg, parabolic_scheme, cache, + interface, normal_direction, + i, j, local_side, + i_surface, j_surface, + local_direction, local_element) + + i_element += i_element_step_i + j_element += j_element_step_i + k_element += k_element_step_i + + i_surface += i_surface_step_i + j_surface += j_surface_step_i + end + + i_element += i_element_step_j + j_element += j_element_step_j + k_element += k_element_step_j + + i_surface += i_surface_step_j + j_surface += j_surface_step_j + end + end + + return nothing +end + +@inline function calc_mpi_interface_flux_divergence!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + interface_index, normal_direction, + interface_i_node_index, + interface_j_node_index, local_side, + surface_i_node_index, + surface_j_node_index, + local_direction_index, + local_element_index) + @unpack u = cache.mpi_interfaces + + viscous_flux_normal_ll, viscous_flux_normal_rr = get_surface_node_vars(u, + equations_parabolic, + dg, + interface_i_node_index, + interface_j_node_index, + interface_index) + + flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + normal_direction, Divergence(), + equations_parabolic, parabolic_scheme) + + dirFactor = (local_side == 1) ? 
1 : -1
+
+    # The stored flux is oriented with respect to the primary side's normal;
+    # flip the sign when the local element sits on the secondary side
+    # (`local_side == 2`), whose outward normal points the opposite way.
+    for v in eachvariable(equations_parabolic)
+        surface_flux_values[v, surface_i_node_index, surface_j_node_index,
+                            local_direction_index, local_element_index] = dirFactor *
+                                                                          flux_[v]
+    end
+
+    return nothing
+end
+
+function calc_mpi_mortar_flux_divergence!(surface_flux_values,
+                                          mesh::Union{P4estMeshParallel{3},
+                                                      T8codeMeshParallel{3}},
+                                          equations_parabolic,
+                                          mortar_l2::LobattoLegendreMortarL2,
+                                          dg::DG, parabolic_scheme, cache)
+    @unpack fstar_primary_threaded, fstar_tmp_threaded = cache
+
+    @threaded for mortar in eachmpimortar(dg, cache)
+        fstar = fstar_primary_threaded[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+
+        for position in 1:4
+            for j in eachnode(dg)
+                for i in eachnode(dg)
+                    normal_direction = get_normal_direction(cache.mpi_mortars, i, j,
+                                                            position, mortar)
+
+                    calc_mpi_mortar_flux_divergence!(fstar, mesh,
+                                                     equations_parabolic,
+                                                     dg, parabolic_scheme, cache,
+                                                     mortar, position,
+                                                     normal_direction, i, j)
+                end
+            end
+        end
+
+        u_buffer = cache.u_threaded[Threads.threadid()]
+
+        mpi_mortar_fluxes_to_elements_divergence!(surface_flux_values, mesh,
+                                                  equations_parabolic, mortar_l2,
+                                                  dg, cache,
+                                                  mortar, fstar, u_buffer, fstar_tmp)
+    end
+
+    return nothing
+end
+
+@inline function calc_mpi_mortar_flux_divergence!(fstar,
+                                                  mesh::Union{P4estMeshParallel{3},
+                                                              T8codeMeshParallel{3}},
+                                                  equations_parabolic,
+                                                  dg::DG, parabolic_scheme, cache,
+                                                  mortar_index, position_index,
+                                                  normal_direction,
+                                                  i_node_index, j_node_index)
+    @unpack u = cache.mpi_mortars
+
+    for v in eachvariable(equations_parabolic)
+        viscous_flux_normal_ll = u[1, v, position_index, i_node_index, j_node_index,
+                                   mortar_index]
+        viscous_flux_normal_rr = u[2, v, position_index, i_node_index, j_node_index,
+                                   mortar_index]
+
+        flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr,
+                               normal_direction, Divergence(),
+                               equations_parabolic, parabolic_scheme)
+
+        fstar[v, i_node_index, j_node_index, position_index] = flux_
+    end
+
+    return nothing
+end
+
+@inline function mpi_mortar_fluxes_to_elements_divergence!(surface_flux_values,
+                                                           mesh::Union{P4estMeshParallel{3},
+                                                                       T8codeMeshParallel{3}},
+                                                           equations_parabolic,
+                                                           mortar_l2::LobattoLegendreMortarL2,
+                                                           dg::DGSEM, cache, mortar,
+                                                           fstar, u_buffer, fstar_tmp)
+    # Forward to the hyperbolic `mpi_mortar_fluxes_to_elements!`; `fstar` is passed
+    # for both the primary and secondary fluxes since no sign flip is needed here.
+    mpi_mortar_fluxes_to_elements!(surface_flux_values, mesh,
+                                   equations_parabolic, mortar_l2, dg, cache,
+                                   mortar, fstar, fstar, u_buffer, fstar_tmp)
+
+    return nothing
+end
+
+function calc_mortar_flux_gradient!(surface_flux_values,
+                                    mesh::Union{P4estMeshParallel{3},
+                                                T8codeMeshParallel{3}},
+                                    equations_parabolic,
+                                    mortar_l2::LobattoLegendreMortarL2,
+                                    dg::DG, parabolic_scheme, cache)
+    @unpack neighbor_ids, node_indices = cache.mortars
+    @unpack contravariant_vectors = cache.elements
+    @unpack fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded = cache
+    index_range = eachnode(dg)
+
+    @threaded for mortar in eachmortar(dg, cache)
+        # Choose thread-specific pre-allocated container
+        fstar_primary = fstar_primary_threaded[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+
+        # Get index information on the small elements
+        small_indices = node_indices[1, mortar]
+        small_direction = indices2direction(small_indices)
+
+        i_small_start, i_small_step_i, i_small_step_j = index_to_start_step_3d(small_indices[1],
+                                                                               index_range)
+        j_small_start, j_small_step_i,
j_small_step_j = index_to_start_step_3d(small_indices[2], + index_range) + k_small_start, k_small_step_i, k_small_step_j = index_to_start_step_3d(small_indices[3], + index_range) + + for position in 1:4 + i_small = i_small_start + j_small = j_small_start + k_small = k_small_start + element = neighbor_ids[position, mortar] + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(small_direction, + contravariant_vectors, + i_small, j_small, k_small, + element) + + calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh, equations_parabolic, + dg, parabolic_scheme, cache, + mortar, position, normal_direction, + i, j) + + i_small += i_small_step_i + j_small += j_small_step_i + k_small += k_small_step_i + end + i_small += i_small_step_j + j_small += j_small_step_j + k_small += k_small_step_j + end + end + + # Buffer to interpolate flux values of the large element to before + # copying in the correct orientation + u_buffer = cache.u_threaded[Threads.threadid()] + + # in calc_interface_flux!, the interface flux is computed once over each + # interface using the normal from the "primary" element. The result is then + # passed back to the "secondary" element, flipping the sign to account for the + # change in the normal direction. For mortars, this sign flip occurs in + # "mortar_fluxes_to_elements_gradient!" instead. + mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh, equations_parabolic, mortar_l2, + dg, cache, + mortar, fstar_primary, fstar_secondary, + u_buffer, fstar_tmp) + end + + return nothing +end + +# We structure `calc_mortar_flux_gradient!` similarly to "calc_mortar_flux!" for +# hyperbolic equations with no nonconservative terms. +# The reasoning is that parabolic fluxes are treated like conservative +# terms (e.g., we compute a viscous conservative "flux") and thus no +# non-conservative terms are present. 
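+#
+# As a point of reference, a minimal sketch of what the gradient-stage numerical
+# flux amounts to, assuming the commonly used BR1 formulation
+# (`parabolic_scheme::ViscousFormulationBR1`; other schemes such as LDG may
+# weight the two states differently):
+#
+#     flux_parabolic(u_ll, u_rr, normal_direction, Gradient(),
+#                    equations_parabolic, parabolic_scheme) == 0.5f0 * (u_ll + u_rr)
+#
+# i.e., the central average of the two interface states.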
+@inline function calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + mortar_index, position_index, + normal_direction, + i_node_index, j_node_index) + @unpack u = cache.mortars + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, position_index, + i_node_index, j_node_index, mortar_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + # Copy flux to buffer + set_node_vars!(fstar_primary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + # As in `calc_interface_flux_gradient!`, no sign flip is necessary for the fluxes used in gradient calculations + set_node_vars!(fstar_secondary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + + return nothing +end + +function calc_mpi_mortar_flux_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DG, parabolic_scheme, cache) + @unpack fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded = cache + + @threaded for mortar in eachmpimortar(dg, cache) + fstar_primary = fstar_primary_threaded[Threads.threadid()] + fstar_secondary = fstar_secondary_threaded[Threads.threadid()] + fstar_tmp = fstar_tmp_threaded[Threads.threadid()] + + for position in 1:4 + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(cache.mpi_mortars, i, j, + position, mortar) + + calc_mpi_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh, equations_parabolic, + dg, parabolic_scheme, cache, + mortar, position, + normal_direction, i, j) + end + end + end + + u_buffer = cache.u_threaded[Threads.threadid()] + + mpi_mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh, equations_parabolic, mortar_l2, + dg, cache, + mortar, fstar_primary, fstar_secondary, + u_buffer, fstar_tmp) + end + + return nothing +end + +@inline function calc_mpi_mortar_flux_gradient!(fstar_primary, fstar_secondary, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + dg::DG, parabolic_scheme, cache, + mortar_index, position_index, + normal_direction, + i_node_index, j_node_index) + @unpack u = cache.mpi_mortars + + u_ll, u_rr = get_surface_node_vars(u, equations_parabolic, dg, position_index, + i_node_index, j_node_index, mortar_index) + + flux_ = flux_parabolic(u_ll, u_rr, normal_direction, Gradient(), + equations_parabolic, parabolic_scheme) + + set_node_vars!(fstar_primary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + set_node_vars!(fstar_secondary, flux_, equations_parabolic, dg, + i_node_index, j_node_index, position_index) + + return nothing +end + +@inline function mpi_mortar_fluxes_to_elements_gradient!(surface_flux_values, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM, cache, mortar, + fstar_primary, fstar_secondary, + u_buffer, fstar_tmp) + @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars + index_range = eachnode(dg) + + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + large_indices = node_indices[2, mortar] + large_direction = indices2direction(large_indices) + large_surface_indices = surface_indices(large_indices) + + 
i_large_start, i_large_step_i, i_large_step_j = index_to_start_step_3d(large_surface_indices[1], + index_range) + j_large_start, j_large_step_i, j_large_step_j = index_to_start_step_3d(large_surface_indices[2], + index_range) + + for (element, position) in zip(local_neighbor_ids[mortar], + local_neighbor_positions[mortar]) + if position == 5 + multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_lower, mortar_l2.reverse_lower, + view(fstar_secondary, .., 1), + fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, + mortar_l2.reverse_lower, + view(fstar_secondary, .., 2), + fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_lower, + mortar_l2.reverse_upper, + view(fstar_secondary, .., 3), + fstar_tmp) + add_multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, + mortar_l2.reverse_upper, + view(fstar_secondary, .., 4), + fstar_tmp) + + # same sign/scale handling as local mortar_fluxes_to_elements_gradient! + #u_buffer .*= -4 + + i_large = i_large_start + j_large = j_large_start + for j in eachnode(dg) + for i in eachnode(dg) + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i_large, j_large, + large_direction, element] = u_buffer[v, i, j] + end + i_large += i_large_step_i + j_large += j_large_step_i + end + i_large += i_large_step_j + j_large += j_large_step_j + end + else + for j in eachnode(dg) + for i in eachnode(dg) + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v, + i, + j, + position] + end + end + end + end + end + + return nothing +end + +function calc_volume_integral_gradient!(gradients, u_transformed, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, # for dispatch only + equations_parabolic::AbstractEquationsParabolic, + dg::DG, cache) + @unpack derivative_hat = dg.basis + @unpack contravariant_vectors = cache.elements + gradients_x, gradients_y, gradients_z = gradients + + @threaded for element in eachelement(dg, cache) + # Calculate volume terms in one element, + # corresponds to `kernel` functions for the hyperbolic part of the flux + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u_transformed, equations_parabolic, dg, + i, j, k, element) + + for ii in eachnode(dg) + multiply_add_to_node_vars!(gradients_x, derivative_hat[ii, i], + u_node, equations_parabolic, dg, + ii, j, k, element) + end + + for jj in eachnode(dg) + multiply_add_to_node_vars!(gradients_y, derivative_hat[jj, j], + u_node, equations_parabolic, dg, + i, jj, k, element) + end + + for kk in eachnode(dg) + multiply_add_to_node_vars!(gradients_z, derivative_hat[kk, k], + u_node, equations_parabolic, dg, + i, j, kk, element) + end + end + + # now that the reference coordinate gradients are computed, transform them node-by-node to physical gradients + # using the contravariant vectors + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, element) + Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, + i, j, k, element) + Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, + i, j, k, element) + + gradients_reference_1 = get_node_vars(gradients_x, equations_parabolic, dg, + i, j, k, element) + gradients_reference_2 = get_node_vars(gradients_y, equations_parabolic, dg, + i, j, k, element) + gradients_reference_3 = get_node_vars(gradients_z, equations_parabolic, dg, + i, j, k, element) + + # note that the 
contravariant vectors are transposed compared with computations of flux
+        # divergences in `calc_volume_integral!`. See
+        # https://github.com/trixi-framework/Trixi.jl/pull/1490#discussion_r1213345190
+        # for a more detailed discussion.
+        gradient_x_node = Ja11 * gradients_reference_1 +
+                          Ja21 * gradients_reference_2 +
+                          Ja31 * gradients_reference_3
+        gradient_y_node = Ja12 * gradients_reference_1 +
+                          Ja22 * gradients_reference_2 +
+                          Ja32 * gradients_reference_3
+        gradient_z_node = Ja13 * gradients_reference_1 +
+                          Ja23 * gradients_reference_2 +
+                          Ja33 * gradients_reference_3
+
+        set_node_vars!(gradients_x, gradient_x_node, equations_parabolic, dg,
+                       i, j, k, element)
+        set_node_vars!(gradients_y, gradient_y_node, equations_parabolic, dg,
+                       i, j, k, element)
+        set_node_vars!(gradients_z, gradient_z_node, equations_parabolic, dg,
+                       i, j, k, element)
+        end
+    end
+
+    return nothing
+end
+
+# This specialization on `flux_viscous::Tuple` is needed to avoid ambiguity with
+# the hyperbolic version of `prolong2boundaries!` in dg_3d.jl, which operates on
+# the solution variables themselves, i.e., `u::Array{uEltype, 5}`.
+function prolong2boundaries!(cache, flux_viscous::Tuple,
+                             mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                             equations_parabolic::AbstractEquationsParabolic,
+                             dg::DG)
+    (; boundaries) = cache
+    (; contravariant_vectors) = cache.elements
+    index_range = eachnode(dg)
+
+    flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous
+
+    @threaded for boundary in eachboundary(dg, cache)
+        # Copy solution data from the element using "delayed indexing" with
+        # a start value and a step size to get the correct face and orientation.
+        element = boundaries.neighbor_ids[boundary]
+        node_indices = boundaries.node_indices[boundary]
+        direction = indices2direction(node_indices)
+
+        i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1],
+                                                                            index_range)
+        j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2],
+                                                                            index_range)
+        k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3],
+                                                                            index_range)
+
+        i_node = i_node_start
+        j_node = j_node_start
+        k_node = k_node_start
+
+        for j in eachnode(dg)
+            for i in eachnode(dg)
+                # this is the outward normal direction on the primary element
+                normal_direction = get_normal_direction(direction,
+                                                        contravariant_vectors,
+                                                        i_node, j_node, k_node, element)
+
+                for v in eachvariable(equations_parabolic)
+                    # Note: this local `flux_viscous` SVector intentionally shadows
+                    # the `flux_viscous` Tuple argument of this function.
+                    flux_viscous = SVector(flux_viscous_x[v, i_node, j_node, k_node,
+                                                          element],
+                                           flux_viscous_y[v, i_node, j_node, k_node,
+                                                          element],
+                                           flux_viscous_z[v, i_node, j_node, k_node,
+                                                          element])
+
+                    boundaries.u[v, i, j, boundary] = dot(flux_viscous,
+                                                          normal_direction)
+                end
+                i_node += i_node_step_i
+                j_node += j_node_step_i
+                k_node += k_node_step_i
+            end
+            i_node += i_node_step_j
+            j_node += j_node_step_j
+            k_node += k_node_step_j
+        end
+    end
+    return nothing
+end
+
+function calc_boundary_flux!(cache, t,
+                             boundary_condition_parabolic, # works with Dict types
+                             boundary_condition_indices,
+                             operator_type,
+                             mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}},
+                             equations_parabolic::AbstractEquationsParabolic,
+                             surface_integral, dg::DG)
+    (; boundaries) = cache
+    (; node_coordinates, surface_flux_values) = cache.elements
+    (; contravariant_vectors) = cache.elements
+    index_range = eachnode(dg)
+
+    @threaded for local_index in eachindex(boundary_condition_indices)
+        # Use the local index to get the global boundary index from the pre-sorted list
+        boundary_index =
boundary_condition_indices[local_index] + + # Get information on the adjacent element, compute the surface fluxes, + # and store them + element = boundaries.neighbor_ids[boundary_index] + node_indices = boundaries.node_indices[boundary_index] + direction_index = indices2direction(node_indices) + + i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1], + index_range) + j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2], + index_range) + k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3], + index_range) + + i_node = i_node_start + j_node = j_node_start + k_node = k_node_start + + for j in eachnode(dg) + for i in eachnode(dg) + # Extract solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations_parabolic, dg, i, j, + boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, + contravariant_vectors, + i_node, j_node, k_node, element) + + # TODO: revisit if we want more general boundary treatments. + # This assumes the gradient numerical flux at the boundary is the gradient variable, + # which is consistent with BR1, LDG. + flux_inner = u_inner + + # Coordinates at boundary node + x = get_node_coords(node_coordinates, equations_parabolic, dg, + i_node, j_node, k_node, element) + + flux_ = boundary_condition_parabolic(flux_inner, u_inner, + normal_direction, + x, t, operator_type, + equations_parabolic) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations_parabolic) + surface_flux_values[v, i, j, direction_index, element] = flux_[v] + end + + i_node += i_node_step_i + j_node += j_node_step_i + k_node += k_node_step_i + end + i_node += i_node_step_j + j_node += j_node_step_j + k_node += k_node_step_j + end + end + + return nothing +end + +function calc_surface_integral_gradient!(gradients, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, # for dispatch only + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM, cache) + @unpack inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + @unpack contravariant_vectors = cache.elements + + # We also use explicit assignments instead of `+=` to let `@muladd` turn these + # into FMAs (see comment at the top of the file). 
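+    # Because the LGL quadrature weights are symmetric, the first and last inverse
+    # weights coincide (up to roundoff), i.e. roughly
+    #
+    #     inverse_weights[1] ≈ inverse_weights[end]
+    #
+    # which is why a single `factor` suffices for both boundary nodes in each
+    # coordinate direction below.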
+ factor = inverse_weights[1] # For LGL basis: Identical to weighted boundary interpolation at x = ±1 + @threaded for element in eachelement(dg, cache) + for l in eachnode(dg), m in eachnode(dg) + for v in eachvariable(equations_parabolic) + for dim in 1:3 + grad = gradients[dim] + # surface at -x + normal_direction = get_normal_direction(1, contravariant_vectors, + 1, l, m, + element) + grad[v, 1, l, m, element] = (grad[v, + 1, l, m, + element] + + surface_flux_values[v, + l, m, 1, + element] * + factor * + normal_direction[dim]) + + # surface at +x + normal_direction = get_normal_direction(2, contravariant_vectors, + nnodes(dg), l, m, + element) + grad[v, nnodes(dg), l, m, element] = (grad[v, + nnodes(dg), l, m, + element] + + surface_flux_values[v, + l, m, 2, + element] * + factor * + normal_direction[dim]) + + # surface at -y + normal_direction = get_normal_direction(3, contravariant_vectors, + l, 1, m, + element) + grad[v, l, 1, m, element] = (grad[v, + l, 1, m, + element] + + surface_flux_values[v, + l, m, 3, + element] * + factor * + normal_direction[dim]) + + # surface at +y + normal_direction = get_normal_direction(4, contravariant_vectors, + l, nnodes(dg), m, + element) + grad[v, l, nnodes(dg), m, element] = (grad[v, + l, nnodes(dg), m, + element] + + surface_flux_values[v, + l, m, 4, + element] * + factor * + normal_direction[dim]) + + # surface at -z + normal_direction = get_normal_direction(5, contravariant_vectors, + l, m, 1, + element) + grad[v, l, m, 1, element] = (grad[v, + l, m, 1, + element] + + surface_flux_values[v, + l, m, 5, + element] * + factor * + normal_direction[dim]) + + # surface at +z + normal_direction = get_normal_direction(6, contravariant_vectors, + l, m, nnodes(dg), + element) + grad[v, l, m, nnodes(dg), element] = (grad[v, + l, m, nnodes(dg), + element] + + surface_flux_values[v, + l, m, 6, + element] * + factor * + normal_direction[dim]) + end + end + end + end + + return nothing +end + +function prolong2mpiinterfaces!(cache, flux_viscous::Tuple, + mesh::Union{P4estMeshParallel{3}, + T8codeMeshParallel{3}}, + equations_parabolic, dg::DG) + @unpack local_neighbor_ids, node_indices, local_sides = cache.mpi_interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + + @threaded for interface in eachmpiinterface(dg, cache) + local_element = local_neighbor_ids[interface] + local_indices = node_indices[interface] + local_direction = indices2direction(local_indices) + local_side = local_sides[interface] + orientationFactor = local_side == 1 ? 
1 : -1
+
+        i_start, i_step_i, i_step_j = index_to_start_step_3d(local_indices[1],
+                                                             index_range)
+        j_start, j_step_i, j_step_j = index_to_start_step_3d(local_indices[2],
+                                                             index_range)
+        k_start, k_step_i, k_step_j = index_to_start_step_3d(local_indices[3],
+                                                             index_range)
+
+        i_elem = i_start
+        j_elem = j_start
+        k_elem = k_start
+
+        for j in eachnode(dg)
+            for i in eachnode(dg)
+                normal_direction = get_normal_direction(local_direction,
+                                                        contravariant_vectors,
+                                                        i_elem, j_elem, k_elem,
+                                                        local_element)
+
+                for v in eachvariable(equations_parabolic)
+                    flux_viscous = SVector(flux_viscous_x[v, i_elem, j_elem, k_elem,
+                                                          local_element],
+                                           flux_viscous_y[v, i_elem, j_elem, k_elem,
+                                                          local_element],
+                                           flux_viscous_z[v, i_elem, j_elem, k_elem,
+                                                          local_element])
+
+                    cache.mpi_interfaces.u[local_side, v, i, j, interface] = orientationFactor *
+                                                                             dot(flux_viscous,
+                                                                                 normal_direction)
+                end
+
+                i_elem += i_step_i
+                j_elem += j_step_i
+                k_elem += k_step_i
+            end
+
+            i_elem += i_step_j
+            j_elem += j_step_j
+            k_elem += k_step_j
+        end
+    end
+
+    return nothing
+end
+
+# Needed to *not* flip the sign of the inverse Jacobian.
+# This is because the parabolic fluxes are assumed to be of the form
+#     `du/dt + df/dx = dg/dx + source(x, t)`,
+# where f(u) is the inviscid flux and g(u) is the viscous flux.
+function apply_jacobian_parabolic!(du::AbstractArray,
+                                   mesh::Union{P4estMeshParallel{3},
+                                               T8codeMeshParallel{3}},
+                                   equations_parabolic::AbstractEquationsParabolic,
+                                   dg::DG, cache)
+    @unpack inverse_jacobian = cache.elements
+
+    @threaded for element in eachelement(dg, cache)
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            factor = inverse_jacobian[i, j, k, element]
+
+            for v in eachvariable(equations_parabolic)
+                du[v, i, j, k, element] *= factor
+            end
+        end
+    end
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl
index e7f97d96933..372f3cc81ea 100644
--- a/src/solvers/dgsem_p4est/dg_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_parallel.jl
@@ -264,6 +264,28 @@ function create_cache(mesh::P4estMeshParallel, equations::AbstractEquations, dg:
     return cache
 end
 
+function create_cache_parabolic(mesh::P4estMeshParallel,
+                                equations_hyperbolic::AbstractEquations,
+                                dg::DG, n_elements, uEltype)
+
+    # Keep mesh state consistent, just like the hyperbolic parallel cache
+    balance!(mesh)
+    partition!(mesh)
+    update_ghost_layer!(mesh)
+
+    # Local parabolic containers
+    elements = init_elements(mesh, equations_hyperbolic, dg.basis, uEltype)
+    interfaces = init_interfaces(mesh, equations_hyperbolic, dg.basis, elements)
+    boundaries = init_boundaries(mesh, equations_hyperbolic, dg.basis, elements)
+
+    # Minimal viscous container (only what is already used downstream)
+    viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic),
+                                                  nnodes(dg), n_elements,
+                                                  uEltype)
+
+    return (; elements, interfaces, boundaries, viscous_container)
+end
+
 function init_mpi_cache(mesh::P4estMeshParallel, mpi_interfaces, mpi_mortars, nvars,
                         nnodes, uEltype)
     mpi_cache = P4estMPICache(uEltype)
diff --git a/src/solvers/dgsem_tree/dg_3d_parabolic.jl b/src/solvers/dgsem_tree/dg_3d_parabolic.jl
index 79b9aaddcec..ab334ebe53d 100644
--- a/src/solvers/dgsem_tree/dg_3d_parabolic.jl
+++ b/src/solvers/dgsem_tree/dg_3d_parabolic.jl
@@ -8,7 +8,7 @@
 # This method is called when a `SemidiscretizationHyperbolicParabolic` is constructed.
 # It constructs the basic `cache` used throughout the simulation to compute
 # the RHS etc.
-function create_cache_parabolic(mesh::Union{TreeMesh{3}, P4estMesh{3}},
+function create_cache_parabolic(mesh::TreeMesh{3},
                                 equations_hyperbolic::AbstractEquations,
                                 dg::DG, n_elements, uEltype)
     viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic),
diff --git a/src/solvers/dgsem_tree/indicators_1d.jl b/src/solvers/dgsem_tree/indicators_1d.jl
index d72fdd65857..2bdcece8acf 100644
--- a/src/solvers/dgsem_tree/indicators_1d.jl
+++ b/src/solvers/dgsem_tree/indicators_1d.jl
@@ -120,7 +120,7 @@ function create_cache(::Union{Type{IndicatorLöhner}, Type{IndicatorMax}},
 end
 
 # this method is used when the indicator is constructed as for AMR
-function create_cache(typ::Union{Type{IndicatorLöhner}, Type{IndicatorMax}},
+function create_cache(typ::Union{Type{IndicatorLöhner}, Type{IndicatorMax}, Type{IndicatorPositional}},
                       mesh, equations::AbstractEquations, dg::DGSEM, cache)
     return create_cache(typ, equations, dg.basis)
 end
diff --git a/src/solvers/dgsem_tree/indicators_3d.jl b/src/solvers/dgsem_tree/indicators_3d.jl
index 05ba46940c6..369478f2a74 100644
--- a/src/solvers/dgsem_tree/indicators_3d.jl
+++ b/src/solvers/dgsem_tree/indicators_3d.jl
@@ -185,6 +185,16 @@ function create_cache(::Union{Type{IndicatorLöhner}, Type{IndicatorMax}},
     return (; alpha, indicator_threaded)
 end
 
+function create_cache(::Type{IndicatorPositional},
+                      equations::AbstractEquations{3}, basis::LobattoLegendreBasis)
+    uEltype = real(basis)
+    alpha = Vector{uEltype}()
+
+    center_threaded = [MVector{3, uEltype}(undef) for _ in 1:Threads.maxthreadid()]
+
+    return (; alpha, center_threaded)
+end
+
 function (löhner::IndicatorLöhner)(u::AbstractArray{<:Any, 5},
                                    mesh, equations, dg::DGSEM,
                                    cache; kwargs...)
diff --git a/test/test_mpi_p4est_3d.jl b/test/test_mpi_p4est_3d.jl
index 11086b7f593..7da623d94b1 100644
--- a/test/test_mpi_p4est_3d.jl
+++ b/test/test_mpi_p4est_3d.jl
@@ -250,7 +250,63 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem")
     # Ensure that we do not have excessive memory allocations
     # (e.g., from type instabilities)
     @test_allocations(Trixi.rhs!, semi, sol, 1000)
-end
+
+    @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl" begin
+        @test_trixi_include(joinpath(EXAMPLES_DIR,
+                                     "elixir_navierstokes_taylor_green_vortex.jl"),
+                            initial_refinement_level=2, tspan=(0.0, 0.25),
+                            surface_flux=FluxHLL(min_max_speed_naive),
+                            l2=[
+                                0.0001547509861140407,
+                                0.015637861347119624,
+                                0.015637861347119687,
+                                0.022024699158522523,
+                                0.009711013505930812
+                            ],
+                            linf=[
+                                0.0006696415247340326,
+                                0.03442565722527785,
+                                0.03442565722577423,
+                                0.06295407168705314,
+                                0.032857472756916195
+                            ])
+        # Ensure that we do not have excessive memory allocations
+        # (e.g., from type instabilities)
+        @test_allocations(Trixi.rhs!, semi, sol, 1000)
+        @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000)
+    end
+
+    @trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl (Diffusive CFL)" begin
+        @test_trixi_include(joinpath(EXAMPLES_DIR,
+                                     "elixir_navierstokes_taylor_green_vortex.jl"),
+                            tspan=(0.0, 0.1),
+                            mu=0.5, # render the flow diffusion-dominated
+                            callbacks=CallbackSet(summary_callback, analysis_callback,
+                                                  alive_callback,
+                                                  StepsizeCallback(cfl = 2.3,
+                                                                   cfl_diffusive = 0.4)),
+                            adaptive=false, # respect the CFL-based step size
+                            ode_alg=CKLLSRK95_4S(),
+                            l2=[
+                                0.0001022410497625877,
+                                0.04954975879887512,
+                                0.049549758798875056,
+                                0.005853983721675305,
+                                0.09161121143324424
+                            ],
+                            linf=[
+                                0.00039284994602417633,
+                                0.14026307274342587,
+                                0.14026307274350203,
+                                0.017003338595870714,
0.2823457296549634 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) + end +end # 3D Testcases end # P4estMesh MPI end # module diff --git a/test/test_parabolic_3d.jl b/test/test_parabolic_3d.jl index f8aa7e85afa..492981f3561 100644 --- a/test/test_parabolic_3d.jl +++ b/test/test_parabolic_3d.jl @@ -439,6 +439,33 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex_amr_mortar.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", + "elixir_navierstokes_taylor_green_vortex_amr_mortar.jl"), + tspan=(0.0, 5.0), + l2=[ + 0.001716903538135598, + 0.2566781540081305, + 0.2566781540081202, + 0.22556320347015937, + 0.33779107381452556 + ], + linf=[ + 0.01139265672066836, + 1.4145605813055888, + 1.4145605813076223, + 1.070081196321056, + 3.332907472310012 + ]) + + @test isapprox(last(summary_callback.analysis_integrals[:enstrophy]), + 1.6598233348361535; + rtol = 1.0e-12, atol = 1.0e-14) + + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh3D: elixir_advection_diffusion_amr.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_3d_dgsem", "elixir_advection_diffusion_amr.jl"),