compiler: fix sparse-op scheduling and lowering for GPU kernels

mloubout · mloubout · commit 7209a6256d10 · 2026-06-25T23:12:48.000-04:00
diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py
@@ -472,9 +472,19 @@ def callback(self, clusters, prefix, seen=None):
             # Construct the HaloTouch Cluster
             expr = Eq(self.B, HaloTouch(*points, halo_scheme=hs))
 
-            key0 = lambda i: i in prefix[:-1] or i in hs.loc_indices  # noqa: B023
+            # The HaloTouch only needs to be scheduled at the outermost
+            # level the halo-exchanged data depends on -- typically the time
+            # loop and the sub-iterators (``loc_indices``) that index it.
+            # Any outer Dimension whose iteration is *independent* of the
+            # halo (e.g. ``p_rec`` for an interpolation reading ``u`` along
+            # the radius nest) shouldn't be in the HaloTouch's ispace;
+            # otherwise its sequentialized properties (``sequentialize()``
+            # below) veto blocking on the real clusters it sits alongside.
+            relevant = (set(hs.loc_indices) |
+                        set().union(*(i._defines for i in hs.loc_indices)))
+            key0 = lambda i: i in prefix[:-1] and i._defines & relevant  # noqa: B023
             key1 = lambda i: not i._defines & set(hs.distributed_defined)  # noqa: B023
-            key = lambda i: key0(i) and key1(i)  # noqa: B023
+            key = lambda i: (key0(i) or i in hs.loc_indices) and key1(i)  # noqa: B023
             ispace = c.ispace.project(key)
 
             properties = c.properties.sequentialize()
diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py
@@ -34,7 +34,7 @@ class EqBlock(CacheInstances):
     @classmethod
     def _preprocess_args(cls, exprs, ispace=null_ispace, guards=None,
                          properties=None, syncs=None, halo_scheme=None):
-        exprs = tuple(ClusterizedEq(e, ispace=ispace) for e in as_tuple(exprs))
+        exprs = tuple(clusterize_eq(e, ispace=ispace) for e in as_tuple(exprs))
         guards = Guards(guards or {})
         properties = Properties(properties or {})
         syncs = normalize_syncs(syncs or {})
diff --git a/devito/ir/equations/equation.py b/devito/ir/equations/equation.py
@@ -208,6 +208,19 @@ def detect(cls, expr):
             (ReduceMin, OpMin),
             (ReduceMax, OpMax),
             (Inc, OpInc),
+            # An ``Interpolation`` looks like a plain ``Eq`` -- ``sf[p_*] =
+            # expr[rp_*]`` -- but the cluster scheduler iterates the rhs
+            # over the radius dims, so values are implicitly summed across
+            # ``rp_*``. Tagging it as ``OpInc`` makes the dependence
+            # analysis treat ``rp_*`` as reduction dims
+            # (``parallel_if_atomic``), which matches the lowered code
+            # (``sumrec += ... ; sf[p_*] = sumrec``) and stops the
+            # blocking heuristic from turning the tiny radius loops into
+            # thread blocks. The actual write-back flavour at ``sf[p_*]``
+            # is keyed off the Eq's class (``is_increment_writeback``) in
+            # ``lower_sparse_ops`` so this tag doesn't accidentally turn
+            # ``Interpolation`` assignments into increments.
+            (InterpolationMixin, OpInc),
         )
         for kls, op in reduction_mapper:
             if isinstance(expr, kls):
@@ -366,12 +379,17 @@ class LoweredSparseEq(SparseOpMixin, LoweredEq):
 
 class LoweredInterpolation(InterpolationMixin, LoweredSparseEq):
     """IR counterpart of ``Interpolation``."""
-    pass
+    # ``sf[p_*] = ...``: the write-back at the sparse position is an
+    # assignment. The Eq is still tagged as a reduction
+    # (``OpInc``/``is_Reduction``) because the rhs is summed over the
+    # radius dims; only the *outer* write-back to ``sf[p_*]`` is plain.
+    is_increment_writeback = False
 
 
 class LoweredIncrInterpolation(InterpolationMixin, LoweredSparseEq):
     """IR counterpart of ``IncrInterpolation``."""
-    pass
+    # ``sf[p_*] += ...``: the user asked for ``interpolate(..., increment=True)``.
+    is_increment_writeback = True
 
 
 class LoweredInjection(InjectionMixin, LoweredSparseEq):
@@ -458,12 +476,12 @@ class ClusterizedSparseEq(SparseOpMixin, ClusterizedEq):
 
 class ClusterizedInterpolation(InterpolationMixin, ClusterizedSparseEq):
     """Frozen counterpart of ``LoweredInterpolation``."""
-    pass
+    is_increment_writeback = False
 
 
 class ClusterizedIncrInterpolation(InterpolationMixin, ClusterizedSparseEq):
     """Frozen counterpart of ``LoweredIncrInterpolation``."""
-    pass
+    is_increment_writeback = True
 
 
 class ClusterizedInjection(InjectionMixin, ClusterizedSparseEq):
diff --git a/devito/passes/iet/sparse.py b/devito/passes/iet/sparse.py
@@ -28,8 +28,8 @@
 from devito.ir.equations import DummyEq
 from devito.ir.equations.algorithms import lower_exprs
 from devito.ir.iet import (
-    Call, EntryFunction, Expression, FindNodes, HaloSpot, Increment, Iteration, List,
-    Transformer, make_callable
+    Call, Conditional, EntryFunction, Expression, ExpressionBundle, FindNodes, HaloSpot,
+    Increment, Iteration, List, Transformer, make_callable
 )
 from devito.passes.iet.engine import iet_pass
 from devito.types import Eq, InjectionMixin, InterpolationMixin, Symbol
@@ -120,7 +120,10 @@ def lower_sparse_ops(iet, sregistry=None, **kwargs):
 def _find_outer_iteration(iet, expr):
     """
     Walk up the IET from ``expr`` and return the outermost Iteration
-    whose ``dim.root`` is the SparseFunction's sparse Dimension.
+    whose ``dim.root`` is the SparseFunction's sparse Dimension. This
+    is the entry point of the sparse-op nest in the parent IET; the
+    full nest (including any block Iterations the cluster pipeline
+    wrapped around the sparse loop) gets extracted into the efunc.
     """
     sparse_dim = expr.expr.interpolator.sfunction._sparse_dim
     for it in FindNodes(Iteration).visit(iet):
@@ -146,6 +149,15 @@ def _materialise_nest(nest, exprs):
     interpolation Expression wrap it with the scalar accumulator
     pattern. Multiple sparse-op Expressions sharing the same outer
     Iteration are materialised in one pass and reuse the same temps.
+
+    ``nest`` is the *outermost* sparse-Dimension Iteration, so that the
+    whole block-Iteration hierarchy (e.g. ``p_rec0_blk0`` -> ``p_rec``
+    on the GPU pipeline) is extracted into the efunc and downstream GPU
+    kernel synthesis can fold the block loops into a thread-grid
+    wrapping the kernel body. The temps and the accumulator pattern,
+    however, must live *inside* the innermost sparse Iteration -- one
+    set per sparse point, sitting beneath any thread-index/OOB-guard
+    prelude that the GPU kernel prep may have inserted.
     """
     # Position + coefficient temporaries as IET Expressions. These are
     # the same for every Expression in the group, so we emit them once.
@@ -155,24 +167,63 @@ def _materialise_nest(nest, exprs):
     temp_exprs = tuple(Expression(DummyEq(e.lhs, e.rhs))
                        for e in lower_exprs(sample.sparse_temps()))
 
-    # The radius nest is what runs once per sparse point. For each
-    # interpolation Expression in the group, build its
-    # accumulator-wrapped copy of the radius nest. Injection Exprs
-    # share a single copy of the radius nest (their ``Inc`` already
-    # carries the right ``weights * rhs`` form).
-    inner = nest.nodes[0] if len(nest.nodes) == 1 else List(body=nest.nodes)
+    # Find the innermost sparse-Dimension Iteration within ``nest`` --
+    # that's where the head Expressions actually live, beneath any block
+    # Iterations that the cluster pipeline wrapped around the sparse
+    # loop.
+    sparse_dim = sample.interpolator.sfunction._sparse_dim
+    inner_iter = nest
+    for it in FindNodes(Iteration).visit(nest):
+        if it.dim.root is sparse_dim and \
+                any(e in FindNodes(Expression).visit(it) for e in exprs):
+            inner_iter = it
+
+    # ``inner_iter`` may carry a GPU kernel prelude (thread-index
+    # ``ExpressionBundle`` and OOB ``Conditional``) that downstream
+    # kernel synthesis expects to find at the top of the block dim's
+    # body. The temps and the accumulator pattern go *after* that
+    # prelude.
+    head, body_nodes = _split_kernel_prelude(inner_iter.nodes)
+
+    radius_nest = body_nodes[0] if len(body_nodes) == 1 else List(body=body_nodes)
     interp_exprs = [e for e in exprs if isinstance(e.expr, InterpolationMixin)]
     inject_exprs = [e for e in exprs if isinstance(e.expr, InjectionMixin)]
 
-    body = []
+    new_body = []
     for expr in interp_exprs:
         siblings = [e for e in exprs if e is not expr]
-        body.append(_interp_inner_block(inner, expr, expr.expr.interpolator, siblings))
+        new_body.append(_interp_inner_block(
+            radius_nest, expr, expr.expr.interpolator, siblings))
     if inject_exprs:
         drop = {e: None for e in interp_exprs}
-        body.append(Transformer(drop, nested=True).visit(inner) if drop else inner)
+        new_body.append(Transformer(drop, nested=True).visit(radius_nest)
+                        if drop else radius_nest)
+
+    new_inner_iter = inner_iter._rebuild(
+        nodes=head + temp_exprs + tuple(new_body)
+    )
+    if new_inner_iter is inner_iter:
+        return nest
+    return Transformer({inner_iter: new_inner_iter}, nested=True).visit(nest)
+
 
-    return nest._rebuild(nodes=temp_exprs + tuple(body))
+def _split_kernel_prelude(nodes):
+    """
+    Split ``nodes`` into ``(head, body)``, both tuples. ``head`` is the
+    GPU kernel prelude: a leading ``ExpressionBundle`` (thread index)
+    plus, only if it directly follows, one ``Conditional`` (OOB guard).
+    ``body`` is everything after it. With no leading bundle (e.g. on
+    non-cuda pipelines) ``head`` is empty and ``body`` is all of ``nodes``.
+    """
+    head = ()
+    body = tuple(nodes)
+    if body and isinstance(body[0], ExpressionBundle):
+        head += (body[0],)
+        body = body[1:]
+        if body and isinstance(body[0], Conditional):
+            head += (body[0],)
+            body = body[1:]
+    return head, body
 
 
 def _interp_inner_block(inner, expr, interp, siblings):
@@ -219,10 +270,15 @@ def _interp_inner_block(inner, expr, interp, siblings):
 
     init = Expression(DummyEq(acc, 0))
     inc = Increment(DummyEq(acc, weighted_rhs))
-    # Honour the synthetic Eq's flavour: a SparseInc means the user
-    # asked for ``sf[p_*] += sum`` (interpolation with ``increment=True``);
-    # otherwise the standard write is ``sf[p_*] = sum``.
-    write_back_cls = Increment if eq.is_Reduction else Expression
+    # Honour the synthetic Eq's flavour: an ``IncrInterpolation`` means
+    # the user asked for ``sf[p_*] += sum`` (interpolation with
+    # ``increment=True``); a plain ``Interpolation`` is just
+    # ``sf[p_*] = sum``. We key off the leaf class's
+    # ``is_increment_writeback`` flag rather than ``is_Reduction`` because
+    # both flavours are tagged as reductions (``OpInc``) for dependence
+    # analysis -- the rhs is implicitly summed over the radius dims --
+    # but only the ``IncrInterpolation`` flavour writes back with ``+=``.
+    write_back_cls = Increment if eq.is_increment_writeback else Expression
     write_back = write_back_cls(DummyEq(sf_lhs, acc))
 
     # Single substitution: drop siblings, replace ``expr`` with ``inc``.
diff --git a/devito/types/sparse.py b/devito/types/sparse.py
@@ -415,12 +415,17 @@ def dist_origin(self):
     @memoized_meth
     def _crdim(self, dim):
         """
-        The CustomDimension associated with the Dimension `dim` for
-        the radius of the interpolation/injection stencil
+        The CustomDimension associated with the grid Dimension ``dim``
+        for the radius of the interpolation/injection stencil. The
+        parent is ``dim`` itself so that ``_defines`` traces back to the
+        grid Dimension the radius slides over -- this is what dependence
+        analysis needs to recognise the implicit reduction over ``rp_*``
+        rather than treating ``rp_*`` as if they were derived from the
+        SparseFunction's sparse Dimension.
         """
         sname = self._sparse_dim.name
         return CustomDimension(f"r{sname}{dim.name}", -self.r+1,
-                               self.r, 2*self.r, self._sparse_dim)
+                               self.r, 2*self.r, dim)
 
     @memoized_meth
     def _cond_rdim(self, dim, cond):