compiler: Avoid int32 overflow in linearized host-device transfer size

gaoflow · gaoflow · commit 60496a818b87 · 2026-05-29T04:07:24.000+02:00
When a host-device data transfer is linearized, its array section size is emitted as a product of the Function's per-dimension sizes, e.g. `copyin(u[0:u_vec->size[0]*u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])`. The `size[i]` fields are 32-bit C ints, so for a Function with more than ~2**31 elements the product overflows `int` before it is used as the transfer bound, yielding a bogus size and a corrupt or failed device transfer. Cast each factor of the product to C `long` (a 64-bit integer on LP64 platforms) so the multiplication is carried out in 64-bit arithmetic. Casting the whole product would be too late (the overflow would already have occurred), so each factor is cast individually. Only bounds that are themselves a product are rewritten: non-product bounds (a single size, an offset, a constant) cannot overflow and are left untouched, as are non-transfer expressions. Fixes #2777
diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
@@ -17,7 +17,7 @@
 from devito.passes.iet.langbase import (
     DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask
 )
-from devito.symbolics import INT
+from devito.symbolics import INT, LONG
 from devito.tools import as_tuple, flatten, is_integer, prod
 from devito.types import Symbol
 
@@ -450,6 +450,25 @@ def make_parallel(self, graph, **kwargs):
         return self._make_parallel(graph, sync_mapper=graph.sync_mapper)
 
 
+def _avoid_overflow(expr):
+    """
+    Return `expr` with every factor cast to `long` if it is a product.
+
+    A linearized host-device transfer section is bounded by a product of
+    the Function's per-dimension sizes (e.g. ``size[0]*size[1]*size[2]``).
+    These are 32-bit C ints, so beyond ~2**31 elements the product overflows
+    `int` before it is used as an array bound (see issue #2777). Each factor
+    is cast individually, since casting the whole product would come after
+    the overflow. Only a top-level product (`expr.is_Mul`) is rewritten; any
+    other `expr` is returned unchanged.
+    NOTE(review): C `long` is 64-bit on LP64 but only 32-bit on LLP64
+    targets -- confirm LONG is wide enough for every supported platform.
+    """
+    if getattr(expr, 'is_Mul', False):
+        return expr.func(*[LONG(a) for a in expr.args])
+    return expr
+
+
 class PragmaTransfer(Pragma, Transfer):
 
     """
@@ -492,7 +511,7 @@ def expr_symbols(self):
     @cached_property
     def _generate(self):
         # Stringify sections
-        sections = ''.join([f'[{ccode(i)}:{ccode(j)}]'
+        sections = ''.join([f'[{ccode(_avoid_overflow(i))}:{ccode(_avoid_overflow(j))}]'
                             for i, j in self.sections])
         arguments = [ccode(i) for i in self.arguments]
         return self.pragma % (self.function.name, sections, *arguments)
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
@@ -245,6 +245,28 @@ def test_linearize(self):
         op.apply(time_M=10)
         assert np.all(u.data[1] == 11)
 
+    def test_linearize_transfer_no_overflow(self):
+        # A linearized transfer is sized by the product of the Function's
+        # per-dimension sizes (e.g. `size[0]*size[1]*size[2]`). These are
+        # 32-bit C ints, so past ~2**31 elements the product overflows `int`
+        # before being used as the transfer bound (issue #2777); each factor
+        # must therefore be cast to `long` ahead of the multiplication.
+        # Only the generated code is inspected, so a tiny grid is enough.
+        grid = Grid(shape=(4, 5, 6))
+
+        u = TimeFunction(name='u', grid=grid)
+
+        op = Operator(Eq(u.forward, u + 1), opt=('advanced', {'linearize': True}))
+
+        # Every map/unmap pragma must show all four `size[i]` factors wrapped
+        # in a `(long)` cast
+        for transfer in op.body.maps + op.body.unmaps:
+            code = transfer.ccode.value
+            for i in range(4):
+                assert f'(long)(u_vec->size[{i}])' in code
+            # An un-cast product would contain `u_vec->size[0]*`; rule it out
+            assert 'u_vec->size[0]*' not in code
+
 
 class TestPassesEdgeCases:
 
diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py
@@ -54,14 +54,15 @@ def test_basic(self):
         assert trees[0][1].pragmas[0].ccode.value ==\
             'omp target teams distribute parallel for collapse(3)'
         assert op.body.maps[0].ccode.value ==\
-            ('omp target enter data map(to: u[0:u_vec->size[0]*'
-             'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
+            ('omp target enter data map(to: u[0:(long)(u_vec->size[0])*'
+             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
         assert op.body.unmaps[0].ccode.value ==\
-            ('omp target update from(u[0:u_vec->size[0]*'
-             'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
+            ('omp target update from(u[0:(long)(u_vec->size[0])*'
+             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
         assert op.body.unmaps[1].ccode.value ==\
-            ('omp target exit data map(release: u[0:u_vec->size[0]*'
-             'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)')
+            ('omp target exit data map(release: u[0:(long)(u_vec->size[0])*'
+             '(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])]) '
+             'if(devicerm)')
 
         # Currently, advanced-fsg mode == advanced mode
         op1 = Operator(Eq(u.forward, u + 1), language='openmp', opt='advanced-fsg')
@@ -125,14 +126,17 @@ def test_multiple_eqns(self):
             'omp target teams distribute parallel for collapse(3)'
         for i, f in enumerate([u, v]):
             assert op.body.maps[i].ccode.value ==\
-                (f'omp target enter data map(to: {f.name}[0:{f.name}_vec->size[0]*'
-                 f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
+                (f'omp target enter data map(to: {f.name}'
+                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
+                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
             assert op.body.unmaps[2*i + 0].ccode.value ==\
-                (f'omp target update from({f.name}[0:{f.name}_vec->size[0]*'
-                 f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
+                (f'omp target update from({f.name}'
+                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
+                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
             assert op.body.unmaps[2*i + 1].ccode.value ==\
-                (f'omp target exit data map(release: {f.name}[0:{f.name}_vec->size[0]*'
-                 f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]]) '
+                (f'omp target exit data map(release: {f.name}'
+                 f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
+                 f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])]) '
                  'if(devicerm)')
 
     def test_multiple_loops(self):