try padding allocations

majosm · majosm · commit fbd4d2371d9c · 2026-05-29T16:28:18.000-05:00
diff --git a/arraycontext/impl/pytato/__init__.py b/arraycontext/impl/pytato/__init__.py
@@ -141,6 +141,33 @@ class _NotOnlyDataWrappers(Exception):  # noqa: N818
     pass
 
 
+class _PaddedAllocator:
+    """Wrap a :mod:`pyopencl` allocator so that every buffer is over-allocated.
+
+    Works around an Intel CPU OpenCL runtime bug: the over-provisioned tail
+    work-items of a partial work-group (masked off by the kernel's bounds guard)
+    still commit their global stores, writing past the end of the output buffer
+    and corrupting the host heap. Padding gives those stray stores valid memory to
+    land in; buffers are never smaller than requested, so results are unaffected.
+
+    The overrun is a fraction of the data extent, so padding by the requested size
+    covers it; a fixed floor of *min_pad_bytes* handles buffers whose overrun
+    exceeds their own size. This is a heuristic shield for a runtime bug, not a
+    provably tight bound. Other attribute lookups go to the wrapped *allocator*.
+    """
+
+    def __init__(self, allocator, *, min_pad_bytes: int = 1 << 16) -> None:
+        self._allocator = allocator
+        self._min_pad_bytes = min_pad_bytes
+
+    def __call__(self, nbytes):
+        return self._allocator(nbytes + max(nbytes, self._min_pad_bytes))
+
+    def __getattr__(self, name):
+        # Direct lookup: no recursion while _allocator is unset (copy/unpickle).
+        return getattr(object.__getattribute__(self, "_allocator"), name)
+
+
 # {{{ _BasePytatoArrayContext
 
 class _BasePytatoArrayContext(ArrayContext, abc.ABC):
@@ -384,9 +411,11 @@ def __init__(
             has_svm = has_coarse_grain_buffer_svm(queue.device)
 
             dev = queue.device
-            if (has_svm
-                    and dev.type & cl.device_type.CPU
-                    and "intel" in dev.platform.name.lower()):
+            is_intel_cpu_cl = bool(
+                dev.type & cl.device_type.CPU
+                and "intel" in dev.platform.name.lower())
+
+            if has_svm and is_intel_cpu_cl:
                 # The Intel CPU OpenCL runtime advertises coarse-grain buffer
                 # SVM, but its clEnqueueSVMFree corrupts the host heap ("double
                 # free or corruption"), aborting the process. Fall back to
@@ -411,6 +440,12 @@ def __init__(
                 if use_memory_pool:
                     from pyopencl.tools import MemoryPool
                     allocator = MemoryPool(allocator)
+
+            if is_intel_cpu_cl:
+                # The runtime also writes past the end of the output buffer when
+                # executing the over-provisioned tail of a partial work-group.
+                # Pad allocations so those stray stores land in valid memory.
+                allocator = _PaddedAllocator(allocator)
         else:
             # Check whether the passed allocator allocates SVM
             try:
diff --git a/intel_crash_reproducer.py b/intel_crash_reproducer.py
@@ -41,9 +41,19 @@
 print()
 print(lp.generate_code_v2(knl).device_code())
 
-# Execute the kernel.
+# Execute the kernel. Allocate the output through the array context's padding
+# allocator, which over-allocates buffers to absorb the Intel CPU runtime's
+# out-of-bounds tail-lane stores. Under valgrind this should turn the previous
+# "Invalid write ... 0 bytes after a block" into a write that lands inside the
+# (padded) block.
+from pyopencl.tools import ImmediateAllocator
+
+from arraycontext.impl.pytato import _PaddedAllocator
+
+
 ctx = cl.create_some_context(interactive=False)
 queue = cl.CommandQueue(ctx)
+allocator = _PaddedAllocator(ImmediateAllocator(queue))
 
-_evt, (out,) = knl(queue)
+_evt, (out,) = knl(queue, allocator=allocator)
 print(out.get())