@@ -141,6 +141,33 @@ class _NotOnlyDataWrappers(Exception): # noqa: N818
141141 pass
142142
143143
144+ class _PaddedAllocator :
145+ """Wraps a :mod:`pyopencl` allocator to over-allocate every buffer.
146+
147+ This works around a bug in the Intel CPU OpenCL runtime: it executes the
148+ over-provisioned tail work-items of a partial work-group (those masked off
149+ by the kernel's bounds guard) and still commits their global stores, writing
150+ past the end of the output buffer and corrupting the host heap. The extra
151+ padding gives those stray stores valid memory to land in. Buffers are
152+ returned at least as large as requested, so results are unaffected.
153+
154+ The overrun is a fraction of the data extent, so padding by the requested
155+ size covers it; a fixed floor handles buffers small enough that their
156+ overrun exceeds their own size. This is a heuristic shield for a runtime
157+ bug, not a provably tight bound.
158+ """
159+
160+ def __init__ (self , allocator , * , min_pad_bytes : int = 1 << 16 ) -> None :
161+ self ._allocator = allocator
162+ self ._min_pad_bytes = min_pad_bytes
163+
164+ def __call__ (self , nbytes ):
165+ return self ._allocator (nbytes + max (nbytes , self ._min_pad_bytes ))
166+
167+ def __getattr__ (self , name ):
168+ return getattr (self ._allocator , name )
169+
170+
144171# {{{ _BasePytatoArrayContext
145172
146173class _BasePytatoArrayContext (ArrayContext , abc .ABC ):
@@ -384,9 +411,11 @@ def __init__(
384411 has_svm = has_coarse_grain_buffer_svm (queue .device )
385412
386413 dev = queue .device
387- if (has_svm
388- and dev .type & cl .device_type .CPU
389- and "intel" in dev .platform .name .lower ()):
414+ is_intel_cpu_cl = bool (
415+ dev .type & cl .device_type .CPU
416+ and "intel" in dev .platform .name .lower ())
417+
418+ if has_svm and is_intel_cpu_cl :
390419 # The Intel CPU OpenCL runtime advertises coarse-grain buffer
391420 # SVM, but its clEnqueueSVMFree corrupts the host heap ("double
392421 # free or corruption"), aborting the process. Fall back to
@@ -411,6 +440,12 @@ def __init__(
411440 if use_memory_pool :
412441 from pyopencl .tools import MemoryPool
413442 allocator = MemoryPool (allocator )
443+
444+ if is_intel_cpu_cl :
445+ # The runtime also writes past the end of the output buffer when
446+ # executing the over-provisioned tail of a partial work-group.
447+ # Pad allocations so those stray stores land in valid memory.
448+ allocator = _PaddedAllocator (allocator )
414449 else :
415450 # Check whether the passed allocator allocates SVM
416451 try :
0 commit comments