Skip to content

Commit fbd4d23

Browse files
committed
try padding allocations
1 parent a6981eb commit fbd4d23

2 files changed

Lines changed: 50 additions & 5 deletions

File tree

arraycontext/impl/pytato/__init__.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,33 @@ class _NotOnlyDataWrappers(Exception): # noqa: N818
141141
pass
142142

143143

144+
class _PaddedAllocator:
    """Wraps a :mod:`pyopencl` allocator to over-allocate every buffer.

    This works around a bug in the Intel CPU OpenCL runtime: it executes the
    over-provisioned tail work-items of a partial work-group (those masked off
    by the kernel's bounds guard) and still commits their global stores, writing
    past the end of the output buffer and corrupting the host heap. The extra
    padding gives those stray stores valid memory to land in. Buffers are
    returned at least as large as requested, so results are unaffected.

    The overrun is a fraction of the data extent, so padding by the requested
    size covers it; a fixed floor handles buffers small enough that their
    overrun exceeds their own size. This is a heuristic shield for a runtime
    bug, not a provably tight bound.

    :arg allocator: the callable being wrapped; it is invoked with a single
        byte count and its return value is passed through unchanged.
    :arg min_pad_bytes: lower bound on the number of padding bytes added to
        every request.

    Attributes not defined on the wrapper itself are forwarded to the wrapped
    allocator.
    """

    def __init__(self, allocator, *, min_pad_bytes: int = 1 << 16) -> None:
        self._allocator = allocator
        self._min_pad_bytes = min_pad_bytes

    def __call__(self, nbytes):
        """Request *nbytes* plus ``max(nbytes, min_pad_bytes)`` bytes of
        padding from the wrapped allocator and return whatever it returns.
        """
        return self._allocator(nbytes + max(nbytes, self._min_pad_bytes))

    def __getattr__(self, name):
        # __getattr__ only runs after normal lookup has failed. Fetch the
        # wrapped allocator straight from the instance dict: going through
        # ``self._allocator`` would re-enter this method whenever the
        # attribute is not set yet (e.g. on an instance created via __new__
        # while being copied or unpickled) and recurse until RecursionError,
        # which hasattr()/getattr(..., default) do not swallow.
        try:
            wrapped = self.__dict__["_allocator"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        return getattr(wrapped, name)
169+
170+
144171
# {{{ _BasePytatoArrayContext
145172

146173
class _BasePytatoArrayContext(ArrayContext, abc.ABC):
@@ -384,9 +411,11 @@ def __init__(
384411
has_svm = has_coarse_grain_buffer_svm(queue.device)
385412

386413
dev = queue.device
387-
if (has_svm
388-
and dev.type & cl.device_type.CPU
389-
and "intel" in dev.platform.name.lower()):
414+
is_intel_cpu_cl = bool(
415+
dev.type & cl.device_type.CPU
416+
and "intel" in dev.platform.name.lower())
417+
418+
if has_svm and is_intel_cpu_cl:
390419
# The Intel CPU OpenCL runtime advertises coarse-grain buffer
391420
# SVM, but its clEnqueueSVMFree corrupts the host heap ("double
392421
# free or corruption"), aborting the process. Fall back to
@@ -411,6 +440,12 @@ def __init__(
411440
if use_memory_pool:
412441
from pyopencl.tools import MemoryPool
413442
allocator = MemoryPool(allocator)
443+
444+
if is_intel_cpu_cl:
445+
# The runtime also writes past the end of the output buffer when
446+
# executing the over-provisioned tail of a partial work-group.
447+
# Pad allocations so those stray stores land in valid memory.
448+
allocator = _PaddedAllocator(allocator)
414449
else:
415450
# Check whether the passed allocator allocates SVM
416451
try:

intel_crash_reproducer.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,19 @@
4141
print()
4242
print(lp.generate_code_v2(knl).device_code())
4343

44-
# Execute the kernel.
44+
# Execute the kernel. Allocate the output through the array context's padding
45+
# allocator, which over-allocates buffers to absorb the Intel CPU runtime's
46+
# out-of-bounds tail-lane stores. Under valgrind this should turn the previous
47+
# "Invalid write ... 0 bytes after a block" into a write that lands inside the
48+
# (padded) block.
49+
from pyopencl.tools import ImmediateAllocator
50+
51+
from arraycontext.impl.pytato import _PaddedAllocator
52+
53+
4554
ctx = cl.create_some_context(interactive=False)
4655
queue = cl.CommandQueue(ctx)
56+
allocator = _PaddedAllocator(ImmediateAllocator(queue))
4757

48-
_evt, (out,) = knl(queue)
58+
_evt, (out,) = knl(queue, allocator=allocator)
4959
print(out.get())

0 commit comments

Comments
 (0)