Skip to content

Commit 60496a8

Browse files
committed
compiler: Avoid int32 overflow in linearized host-device transfer size
When a host-device data transfer is linearized, its array section size is emitted as a product of the Function's per-dimension sizes, e.g. `copyin(u[0:u_vec->size[0]*u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])`. The `size[i]` fields are 32-bit C ints, so for a Function with more than ~2**31 elements the product overflows `int` before it is used as the transfer bound, yielding a bogus size and a corrupt or failed device transfer. Cast each factor of the product to `long` (a 64-bit integer on LP64 platforms) so the multiplication is carried out in 64-bit arithmetic. Casting the whole product would be too late (the overflow would already have occurred), so each factor is cast individually. Non-product bounds (a single size, an offset, a constant) cannot overflow and are left untouched, as are non-transfer expressions. Fixes #2777.
1 parent dc3fa07 commit 60496a8

3 files changed

Lines changed: 59 additions & 14 deletions

File tree

devito/passes/iet/parpragma.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from devito.passes.iet.langbase import (
1818
DeviceAwareMixin, LangBB, LangTransformer, ShmTransformer, make_sections_from_imask
1919
)
20-
from devito.symbolics import INT
20+
from devito.symbolics import INT, LONG
2121
from devito.tools import as_tuple, flatten, is_integer, prod
2222
from devito.types import Symbol
2323

@@ -450,6 +450,25 @@ def make_parallel(self, graph, **kwargs):
450450
return self._make_parallel(graph, sync_mapper=graph.sync_mapper)
451451

452452

453+
def _avoid_overflow(expr):
454+
"""
455+
The bounds of a host-device transfer section may be a product of the
456+
Function's per-dimension sizes (e.g. ``size[0]*size[1]*size[2]``), as
457+
happens when the transferred data is flattened (linearized). These sizes
458+
are 32-bit C ints, so for a sufficiently large Function (more than ~2**31
459+
elements) the product overflows `int` before it is used as an array bound,
460+
producing a bogus transfer size (see issue #2777). Cast each factor of the
461+
product to a 64-bit integer so the multiplication is carried out in 64-bit
462+
arithmetic. A cast on the whole product would be too late (the overflow
463+
would already have happened), hence each factor is cast individually.
464+
Non-product bounds (a single size, an offset, a constant) cannot overflow
465+
and are left untouched.
466+
"""
467+
if getattr(expr, 'is_Mul', False):
468+
return expr.func(*[LONG(a) for a in expr.args])
469+
return expr
470+
471+
453472
class PragmaTransfer(Pragma, Transfer):
454473

455474
"""
@@ -492,7 +511,7 @@ def expr_symbols(self):
492511
@cached_property
493512
def _generate(self):
494513
# Stringify sections
495-
sections = ''.join([f'[{ccode(i)}:{ccode(j)}]'
514+
sections = ''.join([f'[{ccode(_avoid_overflow(i))}:{ccode(_avoid_overflow(j))}]'
496515
for i, j in self.sections])
497516
arguments = [ccode(i) for i in self.arguments]
498517
return self.pragma % (self.function.name, sections, *arguments)

tests/test_gpu_common.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,28 @@ def test_linearize(self):
245245
op.apply(time_M=10)
246246
assert np.all(u.data[1] == 11)
247247

248+
def test_linearize_transfer_no_overflow(self):
    # When a transfer is linearized, its size is a product of the
    # Function's per-dimension sizes (e.g. `size[0]*size[1]*size[2]`).
    # These are 32-bit C ints, so for a Function with more than ~2**31
    # elements the product overflows `int` before being used as the
    # transfer bound, producing a bogus size (issue #2777). Each factor
    # must be cast to a 64-bit int so the product is computed in 64-bit.
    grid = Grid(shape=(4, 5, 6))

    u = TimeFunction(name='u', grid=grid)

    op = Operator(Eq(u.forward, u + 1), opt=('advanced', {'linearize': True}))

    transfers = op.body.maps + op.body.unmaps

    # Without at least one transfer the checks below would pass vacuously
    assert len(transfers) > 0

    # The transfer bound is a product of the four `size[i]`, each cast to
    # `long`; the multiplication is thus carried out in 64-bit arithmetic
    for transfer in transfers:
        code = transfer.ccode.value
        for i in range(4):
            assert f'(long)(u_vec->size[{i}])' in code
            # No un-cast `size[i]*` factor (which would multiply in 32-bit);
            # a cast factor reads `size[i])*`, so it does not match
            assert f'u_vec->size[{i}]*' not in code
269+
248270

249271
class TestPassesEdgeCases:
250272

tests/test_gpu_openmp.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,15 @@ def test_basic(self):
5454
assert trees[0][1].pragmas[0].ccode.value ==\
5555
'omp target teams distribute parallel for collapse(3)'
5656
assert op.body.maps[0].ccode.value ==\
57-
('omp target enter data map(to: u[0:u_vec->size[0]*'
58-
'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
57+
('omp target enter data map(to: u[0:(long)(u_vec->size[0])*'
58+
'(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
5959
assert op.body.unmaps[0].ccode.value ==\
60-
('omp target update from(u[0:u_vec->size[0]*'
61-
'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]])')
60+
('omp target update from(u[0:(long)(u_vec->size[0])*'
61+
'(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])])')
6262
assert op.body.unmaps[1].ccode.value ==\
63-
('omp target exit data map(release: u[0:u_vec->size[0]*'
64-
'u_vec->size[1]*u_vec->size[2]*u_vec->size[3]]) if(devicerm)')
63+
('omp target exit data map(release: u[0:(long)(u_vec->size[0])*'
64+
'(long)(u_vec->size[1])*(long)(u_vec->size[2])*(long)(u_vec->size[3])]) '
65+
'if(devicerm)')
6566

6667
# Currently, advanced-fsg mode == advanced mode
6768
op1 = Operator(Eq(u.forward, u + 1), language='openmp', opt='advanced-fsg')
@@ -125,14 +126,17 @@ def test_multiple_eqns(self):
125126
'omp target teams distribute parallel for collapse(3)'
126127
for i, f in enumerate([u, v]):
127128
assert op.body.maps[i].ccode.value ==\
128-
(f'omp target enter data map(to: {f.name}[0:{f.name}_vec->size[0]*'
129-
f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
129+
(f'omp target enter data map(to: {f.name}'
130+
f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
131+
f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
130132
assert op.body.unmaps[2*i + 0].ccode.value ==\
131-
(f'omp target update from({f.name}[0:{f.name}_vec->size[0]*'
132-
f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]])')
133+
(f'omp target update from({f.name}'
134+
f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
135+
f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])])')
133136
assert op.body.unmaps[2*i + 1].ccode.value ==\
134-
(f'omp target exit data map(release: {f.name}[0:{f.name}_vec->size[0]*'
135-
f'{f.name}_vec->size[1]*{f.name}_vec->size[2]*{f.name}_vec->size[3]]) '
137+
(f'omp target exit data map(release: {f.name}'
138+
f'[0:(long)({f.name}_vec->size[0])*(long)({f.name}_vec->size[1])*'
139+
f'(long)({f.name}_vec->size[2])*(long)({f.name}_vec->size[3])]) '
136140
'if(devicerm)')
137141

138142
def test_multiple_loops(self):

0 commit comments

Comments
 (0)