compiler: support multi-buffering

mloubout · mloubout · commit 00a97bba2bfb · 2026-05-28T11:08:31.000-04:00
diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 
-from sympy import true
+from sympy import Mod, true
 
 from devito.ir import (
     Backward, Forward, GuardBoundNext, PrefetchUpdate, Queue, ReleaseLock, SyncArray,
@@ -78,7 +78,9 @@ def callback(self, clusters, prefix):
             d = self.key0(c0)
             if d is not dim:
                 continue
-
+            print(c0.guards)
+            if d in c0.guards and not c0.guards[d].has(Mod):
+                continue
             protected = self._schedule_waitlocks(c0, d, clusters, locks, syncs)
             self._schedule_withlocks(c0, d, protected, locks, syncs)
 
diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py
@@ -116,7 +116,7 @@ def key(f):
     # Then we inject them into the Clusters. This involves creating the
     # initializing Clusters, and replacing the buffered Functions with the buffers
     clusters = InjectBuffers(mapper, sregistry, options).process(clusters)
-    print(clusters)
+
     return clusters
 
 
@@ -142,22 +142,20 @@ def callback(self, clusters, prefix):
             return clusters
         d = prefix[-1].dim
 
-        def key(f, *args):
-            for (ff, _) in self.mapper:
-                if f == ff:
-                    return True
-            return False
+        key = lambda f, *args: any(f == ff for ff, _ in self.mapper)
         bfmap = map_buffered_functions(clusters, key)
 
         # A BufferDescriptor is a simple data structure storing additional
         # information about a buffer, harvested from the subset of `clusters`
         # that access it
-        descriptors = {b: BufferDescriptor(f, b, bfmap[f], g)
-                       for (f, g), b in self.mapper.items()
-                       if f in bfmap}
+        descriptors = {}
+        for (f, g), b in self.mapper.items():
+            if f in bfmap:
+                descriptors.setdefault(b, []).append(BufferDescriptor(f, b, bfmap[f], g))
 
         # Are we inside the right `d`?
-        descriptors = {b: v for b, v in descriptors.items() if d in v.itdims}
+        descriptors = {b: [vi for vi in v if d in vi.itdims]
+                       for b, v in descriptors.items()}
 
         if not descriptors:
             return clusters
@@ -172,23 +170,28 @@ def key(f, *args):
         # Substitution rules to replace buffered Functions with buffers
         # E.g., `usave[time+1, x+1, y+1] -> ub0[t1, x+1, y+1]`
         subs = {}
-        for b, v in descriptors.items():
-            accesses = chain(*[c.scope[v.f] for c in v.clusters])
-            index_mapper = {i: mds[(v.xd, i)] for i in v.indices}
-            for a in accesses:
-                subs[a.access] = b.indexed[[index_mapper.get(i, i) for i in a]]
+        for b, vb in descriptors.items():
+            for v in vb:
+                for c in v.clusters:
+                    if c.guards.get(d) != v.guards.get(d):
+                        continue
+                    subs.setdefault(c, {})
+                    accesses = c.scope[v.f]
+                    index_mapper = {i: mds[(v.xd, i)] for i in v.indices}
+                    for a in accesses:
+                        subs[c][a.access] = b.indexed[[index_mapper.get(i, i) for i in a]]
 
         processed = []
         for c in clusters:
             # If a buffer is read but never written, then we need to add
             # an Eq to step through the next slot
             # E.g., `ub[0, x] = usave[time+2, x]`
-            for _, v in descriptors.items():
+            for v in chain.from_iterable(descriptors.values()):
                 if not v.is_readonly:
                     continue
                 if c not in v.firstread:
                     continue
-                if not c.guards.get(d) == v.guards.get(d):
+                if c.guards.get(d) != v.guards.get(d):
                     continue
 
                 idxf = v.last_idx[c]
@@ -219,7 +222,7 @@ def key(f, *args):
                 processed.append(Cluster(expr, ispace, guards, properties, syncs))
 
             # Substitute the buffered Functions with the buffers
-            exprs = [uxreplace(e, subs) for e in c.exprs]
+            exprs = [uxreplace(e, subs.get(c, {})) for e in c.exprs]
             ispace = c.ispace.augment(subiters)
             properties = c.properties.sequentialize(d)
             processed.append(
@@ -228,12 +231,12 @@ def key(f, *args):
 
             # Append the copy-back if `c` is the last-write of some buffers
             # E.g., `usave[time+1, x] = ub[t1, x]`
-            for _, v in descriptors.items():
+            for v in chain.from_iterable(descriptors.values()):
                 if v.is_readonly:
                     continue
                 if c not in v.lastwrite:
                     continue
-                if not c.guards.get(d) == v.guards.get(d):
+                if c.guards.get(d) != v.guards.get(d):
                     continue
 
                 idxf = v.last_idx[c]
@@ -269,36 +272,37 @@ def key(f, *args):
         return init + processed
 
     def _optimize(self, clusters, descriptors):
-        for b, v in descriptors.items():
-            if v.is_writeonly:
-                # `b` might be written by multiple, potentially mutually
-                # exclusive, equations. For example, two equations that have or
-                # will have complementary guards, hence only one will be
-                # executed. In such a case, we can split the equations over
-                # separate IterationSpaces
-                key0 = lambda: Stamp()
-            elif v.is_readonly:
-                # `b` is read multiple times -- this could just be the case of
-                # coupled equations, so we more cautiously perform a
-                # "buffer-wise" splitting of the IterationSpaces (i.e., only
-                # relevant if there are at least two read-only buffers)
-                stamp = Stamp()
-                key0 = lambda: stamp  # noqa: B023
-            else:
-                continue
-
-            processed = []
-            for c in clusters:
-                if b not in c.functions:
-                    processed.append(c)
+        for b, vb in descriptors.items():
+            for v in vb:
+                if v.is_writeonly:
+                    # `b` might be written by multiple, potentially mutually
+                    # exclusive, equations. For example, two equations that have or
+                    # will have complementary guards, hence only one will be
+                    # executed. In such a case, we can split the equations over
+                    # separate IterationSpaces
+                    key0 = lambda: Stamp()
+                elif v.is_readonly:
+                    # `b` is read multiple times -- this could just be the case of
+                    # coupled equations, so we more cautiously perform a
+                    # "buffer-wise" splitting of the IterationSpaces (i.e., only
+                    # relevant if there are at least two read-only buffers)
+                    stamp = Stamp()
+                    key0 = lambda: stamp  # noqa: B023
+                else:
                     continue
 
-                key1 = lambda d: not d._defines & v.dim._defines  # noqa: B023
-                dims = c.ispace.project(key1).itdims
-                ispace = c.ispace.lift(dims, key0())
-                processed.append(c.rebuild(ispace=ispace))
+                processed = []
+                for c in clusters:
+                    if b not in c.functions:
+                        processed.append(c)
+                        continue
+
+                    key1 = lambda d: not d._defines & v.dim._defines  # noqa: B023
+                    dims = c.ispace.project(key1).itdims
+                    ispace = c.ispace.lift(dims, key0())
+                    processed.append(c.rebuild(ispace=ispace))
 
-            clusters = processed
+                clusters = processed
 
         return clusters
 
@@ -314,11 +318,11 @@ def _reuse(self, init, clusters, descriptors):
             cbk = lambda v: v
 
         mapper = as_mapper(descriptors, key=lambda b: b._signature)
-        mapper = {k: cbk(v) for k, v in mapper.items() if cbk(v)}
+        mapper = {k: [cbk(v) for v in vb if cbk(v)] for k, vb in mapper.items()}
 
         subs = {}
         drop = set()
-        for reusable in mapper.values():
+        for reusable in chain.from_iterable(mapper.values()):
             retain = reusable.pop(0)
             drop.update(reusable)
 
@@ -365,18 +369,24 @@ def generate_buffers(clusters, key, sregistry, options, **kwargs):
     # {buffered Function -> Buffer}
     xds = {}
     mapper = {}
+    extras = {}
     for f, clusters in bfmap.items():
         for k, ck in groupby(clusters, key=lambda c: c.guards):
+            ck = list(ck)
             exprs = flatten(c.exprs for c in ck)
 
             bdims = key(f, exprs)
 
             dims = [d for d in f.dimensions if d not in bdims]
             if len(dims) != 1:
                 raise CompilationError(f"Unsupported multi-dimensional `buffering` "
-                                    f"required by `{f}`")
+                                       f"required by `{f}`")
             dim = dims.pop()
 
+            if not dim._defines & k.keys():
+                extras.setdefault(f, []).append(k)
+                continue
+
             if is_buffering(exprs):
                 # Multi-level buffering
                 # NOTE: a bit rudimentary (we could go through the exprs one by one
@@ -386,13 +396,15 @@ def generate_buffers(clusters, key, sregistry, options, **kwargs):
                 buffer, = buffers
                 xd = buffer.indices[dim]
             else:
-                size = infer_buffer_size(f, dim, clusters)
+
+                size = infer_buffer_size(f, dim, ck)
 
                 if async_degree is not None:
                     if async_degree < size:
                         warning(
                             'Ignoring provided asynchronous degree as it would be '
-                            f'too small for the required buffer (provided {async_degree}, '
+                            'too small for the required buffer'
+                            f' (provided {async_degree}, '
                             f'but need at least {size} for `{f.name}`)'
                         )
                     else:
@@ -421,6 +433,13 @@ def generate_buffers(clusters, key, sregistry, options, **kwargs):
                                  padding=padding, grid=f.grid, halo=f.halo,
                                  space='mapped', mapped=f, f=f)
 
+    for f, k in extras.items():
+        for (ff, kk) in dict(mapper):
+            if f == ff:
+                for ki in k:
+                    if ki.keys() & set(mapper[(ff, kk)].dimensions):
+                        mapper[(f, ki)] = mapper[(ff, kk)]
+
     return mapper
 
 
@@ -453,7 +472,7 @@ def __init__(self, f, b, clusters, guards):
         self.indices = extract_indices(f, self.dim, clusters)
 
     def __repr__(self):
-        return f"Descriptor[{self.f} -> {self.b}]"
+        return f"Descriptor[{self.f} -> {self.b}], {self.guards}"
 
     @property
     def size(self):
@@ -668,7 +687,7 @@ def make_mds(descriptors, prefix, sregistry):
     inspecting all buffers so that ModuloDimensions are reused when possible.
     """
     mds = defaultdict(int)
-    for v in descriptors.values():
+    for v in chain.from_iterable(descriptors.values()):
         size = v.xd.symbolic_size
 
         if size == 1:
@@ -684,7 +703,6 @@ def make_mds(descriptors, prefix, sregistry):
         # same strategy is also applied in clusters/algorithms/Stepper
         key = lambda i: -np.inf if i - p == 0 else (i - p)  # noqa: B023
         indices = sorted(v.indices, key=key)
-        v_mds = None
 
         for k, i in enumerate(indices):
             k = (v.xd, i)
@@ -711,42 +729,43 @@ def init_buffers(descriptors, options):
     init_onwrite = options['buf-init-onwrite']
 
     init = []
-    for b, v in descriptors.items():
-        f = v.f
-
-        if v.is_read:
-            # Special case: avoid initialization in the case of double (or
-            # multiple) buffering because it's completely unnecessary
-            if v.is_double_buffering:
-                continue
+    for b, vb in descriptors.items():
+        for v in vb:
+            f = v.f
+
+            if v.is_read:
+                # Special case: avoid initialization in the case of double (or
+                # multiple) buffering because it's completely unnecessary
+                if v.is_double_buffering:
+                    continue
 
-            lhs = b.indexify()._subs(v.xd, v.first_idx.b)
-            rhs = f.indexify()._subs(v.dim, v.first_idx.f)
+                lhs = b.indexify()._subs(v.xd, v.first_idx.b)
+                rhs = f.indexify()._subs(v.dim, v.first_idx.f)
 
-        elif v.is_write and init_onwrite(f):
-            lhs = b.indexify()
-            rhs = S.Zero
+            elif v.is_write and init_onwrite(f):
+                lhs = b.indexify()
+                rhs = S.Zero
 
-        else:
-            continue
+            else:
+                continue
 
-        expr = Eq(lhs, rhs)
-        expr = lower_exprs(expr)
+            expr = Eq(lhs, rhs)
+            expr = lower_exprs(expr)
 
-        ispace = v.write_to
+            ispace = v.write_to
 
-        guards = {}
-        guards[None] = GuardBound(v.dim.root.symbolic_min, v.dim.root.symbolic_max)
-        if v.is_read:
-            guards[v.xd] = GuardBound(0, v.first_idx.f)
+            guards = {}
+            guards[None] = GuardBound(v.dim.root.symbolic_min, v.dim.root.symbolic_max)
+            if v.is_read:
+                guards[v.xd] = GuardBound(0, v.first_idx.f)
 
-        properties = Properties()
-        properties = properties.affine(ispace.itdims)
-        properties = properties.parallelize(ispace.itdims)
+            properties = Properties()
+            properties = properties.affine(ispace.itdims)
+            properties = properties.parallelize(ispace.itdims)
 
-        syncs = {None: [InitArray(None, b)]}
+            syncs = {None: [InitArray(None, b)]}
 
-        init.append(Cluster(expr, ispace, guards, properties, syncs))
+            init.append(Cluster(expr, ispace, guards, properties, syncs))
 
     return init