Skip to content

Commit 648760a

Browse files
committed
try adding padding
1 parent b695525 commit 648760a

2 files changed

Lines changed: 87 additions & 8 deletions

File tree

arraycontext/impl/pytato/__init__.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,33 @@ class _NotOnlyDataWrappers(Exception): # noqa: N818
141141
pass
142142

143143

144+
class _PaddedAllocator:
    """Wraps a :mod:`pyopencl` allocator to over-allocate every buffer.

    This works around a bug in the Intel CPU OpenCL runtime: it executes the
    over-provisioned tail work-items of a partial work-group (those masked off
    by the kernel's bounds guard) and still commits their global stores, writing
    past the end of the output buffer and corrupting the host heap. The extra
    padding gives those stray stores valid memory to land in. Buffers are
    returned at least as large as requested, so results are unaffected.

    The overrun is a fraction of the data extent, so padding by the requested
    size covers it; a fixed floor handles buffers small enough that their
    overrun exceeds their own size. This is a heuristic shield for a runtime
    bug, not a provably tight bound.

    :arg allocator: the wrapped allocator, a callable taking a byte count.
    :arg min_pad_bytes: lower bound on the number of padding bytes added to
        each request (keyword-only).

    Attributes not found on the wrapper itself are looked up on the wrapped
    allocator.
    """

    def __init__(self, allocator, *, min_pad_bytes: int = 1 << 16) -> None:
        self._allocator = allocator
        self._min_pad_bytes = min_pad_bytes

    def __call__(self, nbytes):
        """Return ``allocator(nbytes + max(nbytes, min_pad_bytes))``."""
        return self._allocator(nbytes + max(nbytes, self._min_pad_bytes))

    def __getattr__(self, name):
        """Forward lookups that miss on the wrapper to the wrapped allocator.

        :raises AttributeError: if *name* is one of the wrapper's own private
            attributes, or if the wrapped allocator lacks *name*.
        """
        # __getattr__ only runs after normal lookup has failed. If one of the
        # wrapper's own attributes is missing (e.g. on an instance created via
        # __new__ by copy/pickle before its state is restored), forwarding
        # through self._allocator would re-enter this method and recurse
        # without bound. Report the attribute as missing instead.
        if name in ("_allocator", "_min_pad_bytes"):
            raise AttributeError(name)
        return getattr(self._allocator, name)
169+
170+
144171
# {{{ _BasePytatoArrayContext
145172

146173
class _BasePytatoArrayContext(ArrayContext, abc.ABC):
@@ -379,8 +406,25 @@ def __init__(
379406
self.using_svm = None
380407

381408
if allocator is None:
409+
import pyopencl as cl
382410
from pyopencl.characterize import has_coarse_grain_buffer_svm
383411
has_svm = has_coarse_grain_buffer_svm(queue.device)
412+
413+
dev = queue.device
414+
is_intel_cpu_cl = bool(
415+
dev.type & cl.device_type.CPU
416+
and "intel" in dev.platform.name.lower())
417+
418+
if has_svm and is_intel_cpu_cl:
419+
# The Intel CPU OpenCL runtime writes past the end of output
420+
# buffers (see the padding below), so we over-allocate to absorb
421+
# those stray stores. That padding is incompatible with SVM:
422+
# pyopencl's enqueue_svm_memcpy requires the source and
423+
# destination sizes to match, so an over-allocated SVM array
424+
# fails to transfer. Use buffer allocation, which tolerates an
425+
# oversized backing buffer, instead.
426+
has_svm = False
427+
384428
if has_svm:
385429
self.using_svm = True
386430

@@ -399,6 +443,13 @@ def __init__(
399443
if use_memory_pool:
400444
from pyopencl.tools import MemoryPool
401445
allocator = MemoryPool(allocator)
446+
447+
if is_intel_cpu_cl:
448+
# The Intel CPU OpenCL runtime writes past the end of the output
449+
# buffer when executing the over-provisioned tail of a partial
450+
# work-group, corrupting the host heap. Pad allocations so those
451+
# stray stores land in valid memory.
452+
allocator = _PaddedAllocator(allocator)
402453
else:
403454
# Check whether the passed allocator allocates SVM
404455
try:

test/test_pytato_arraycontext.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,15 @@ def twice(x):
211211
allocator=alloc, use_memory_pool=use_memory_pool)
212212

213213
from pyopencl.tools import ImmediateAllocator, MemoryPool
214-
assert isinstance(actx.allocator,
214+
215+
from arraycontext.impl.pytato import _PaddedAllocator
216+
alloc_to_check = actx.allocator
217+
if isinstance(alloc_to_check, _PaddedAllocator):
218+
# On the Intel CPU runtime the actx wraps its allocator to pad
219+
# buffers (working around an out-of-bounds runtime store); check
220+
# the wrapped allocator's type.
221+
alloc_to_check = alloc_to_check._allocator
222+
assert isinstance(alloc_to_check,
215223
MemoryPool if use_memory_pool else ImmediateAllocator)
216224

217225
f = actx.compile(twice)
@@ -398,6 +406,26 @@ def twice(x):
398406
actx2._enable_profiling(True)
399407

400408

409+
def _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit):
    """Call ``lp.auto_test_vs_ref`` with an explicitly chosen allocator.

    An ``ImmediateAllocator`` on a fresh command queue for *cl_ctx* is used.
    If the context's first device is a CPU device on a platform whose name
    contains "intel", the allocator is wrapped in ``_PaddedAllocator``.
    """
    import loopy as lp
    import pyopencl as cl
    from pyopencl.tools import ImmediateAllocator

    buf_allocator = ImmediateAllocator(cl.CommandQueue(cl_ctx))

    # The Intel CPU OpenCL runtime writes out of bounds past kernel output
    # buffers when executing partial work-groups, corrupting the host heap.
    # auto_test_vs_ref allocates its own buffers, so on that runtime they are
    # padded (via _PaddedAllocator) to give the stray stores valid memory.
    first_dev = cl_ctx.devices[0]
    on_intel_cpu_cl = (
        first_dev.type & cl.device_type.CPU
        and "intel" in first_dev.platform.name.lower())

    if on_intel_cpu_cl:
        from arraycontext.impl.pytato import _PaddedAllocator
        buf_allocator = _PaddedAllocator(buf_allocator)

    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit, allocator=buf_allocator)
427+
428+
401429
def test_parallelize_disjoint_loop_sets_scalar():
402430
import loopy as lp
403431
from loopy.kernel.data import GroupInameTag, LocalInameTag
@@ -489,7 +517,7 @@ def test_parallelize_disjoint_loop_sets_single_non_redn_iname():
489517
== {GroupInameTag(0)}
490518
assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
491519

492-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
520+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
493521

494522

495523
def test_parallelize_disjoint_loop_sets_multiple_non_redn_inames():
@@ -531,7 +559,7 @@ def test_parallelize_disjoint_loop_sets_multiple_non_redn_inames():
531559
== {LocalInameTag(0)}
532560
assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
533561

534-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
562+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
535563

536564

537565
def test_parallelize_disjoint_loop_sets_only_redn_iname():
@@ -572,7 +600,7 @@ def test_parallelize_disjoint_loop_sets_only_redn_iname():
572600
== {GroupInameTag(0)}
573601
assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
574602

575-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
603+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
576604

577605

578606
def test_parallelize_disjoint_loop_sets_mixed():
@@ -612,7 +640,7 @@ def test_parallelize_disjoint_loop_sets_mixed():
612640
== {LocalInameTag(0)}
613641
assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
614642

615-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
643+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
616644

617645

618646
def test_parallelize_disjoint_loop_sets_multiple_independent_loop_sets():
@@ -678,7 +706,7 @@ def test_parallelize_disjoint_loop_sets_multiple_independent_loop_sets():
678706
and insn.synchronization_kind == "global"]
679707
assert len(gbarriers) == 1
680708

681-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
709+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
682710

683711

684712
def test_parallelize_disjoint_loop_sets_multiple_dependent_loop_sets():
@@ -748,7 +776,7 @@ def test_parallelize_disjoint_loop_sets_multiple_dependent_loop_sets():
748776
assert gbarrier.id in knl.id_to_insn["loopset2insn1"].depends_on
749777
assert gbarrier.id in knl.id_to_insn["loopset2insn2"].depends_on
750778

751-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
779+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
752780

753781

754782
def test_alias_global_temporaries():
@@ -807,7 +835,7 @@ def global_temp(name: str):
807835
assert base_storages["tmp2"] != base_storages["tmp1"]
808836
assert len(set(base_storages.values())) == 2
809837

810-
lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
838+
_auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
811839

812840

813841
if __name__ == "__main__":

0 commit comments

Comments
 (0)