Try padding allocations to absorb Intel CPU OpenCL out-of-bounds stores

majosm · majosm · commit 648760ab2d28 · 2026-05-29T23:29:12.000-05:00
diff --git a/arraycontext/impl/pytato/__init__.py b/arraycontext/impl/pytato/__init__.py
@@ -141,6 +141,33 @@ class _NotOnlyDataWrappers(Exception):  # noqa: N818
     pass
 
 
+class _PaddedAllocator:
+    """Wraps a :mod:`pyopencl` allocator to over-allocate every buffer.
+
+    This works around a bug in the Intel CPU OpenCL runtime: it executes the
+    over-provisioned tail work-items of a partial work-group (those masked off
+    by the kernel's bounds guard) and still commits their global stores, writing
+    past the end of the output buffer and corrupting the host heap. The extra
+    padding gives those stray stores valid memory to land in. Buffers are
+    returned at least as large as requested, so results are unaffected.
+
+    Each request of *nbytes* is grown by ``max(nbytes, min_pad_bytes)``: the
+    overrun is a fraction of the data extent, and the floor covers buffers whose
+    overrun exceeds their own size. A heuristic shield, not a tight bound.
+    """
+
+    def __init__(self, allocator, *, min_pad_bytes: int = 1 << 16) -> None:
+        self._allocator = allocator
+        self._min_pad_bytes = min_pad_bytes
+
+    def __call__(self, nbytes):
+        return self._allocator(nbytes + max(nbytes, self._min_pad_bytes))
+
+    def __getattr__(self, name):
+        # Raw lookup so an unset _allocator (e.g. mid-copy) can't recurse here.
+        return getattr(object.__getattribute__(self, "_allocator"), name)
+
+
 # {{{ _BasePytatoArrayContext
 
 class _BasePytatoArrayContext(ArrayContext, abc.ABC):
@@ -379,8 +406,25 @@ def __init__(
         self.using_svm = None
 
         if allocator is None:
+            import pyopencl as cl
             from pyopencl.characterize import has_coarse_grain_buffer_svm
             has_svm = has_coarse_grain_buffer_svm(queue.device)
+
+            dev = queue.device
+            is_intel_cpu_cl = bool(
+                dev.type & cl.device_type.CPU
+                and "intel" in dev.platform.name.lower())
+
+            if has_svm and is_intel_cpu_cl:
+                # The Intel CPU OpenCL runtime writes past the end of output
+                # buffers (see the padding below), so we over-allocate to absorb
+                # those stray stores. That padding is incompatible with SVM:
+                # pyopencl's enqueue_svm_memcpy requires the source and
+                # destination sizes to match, so an over-allocated SVM array
+                # fails to transfer. Use buffer allocation, which tolerates an
+                # oversized backing buffer, instead.
+                has_svm = False
+
             if has_svm:
                 self.using_svm = True
 
@@ -399,6 +443,13 @@ def __init__(
                 if use_memory_pool:
                     from pyopencl.tools import MemoryPool
                     allocator = MemoryPool(allocator)
+
+            if is_intel_cpu_cl:
+                # The Intel CPU OpenCL runtime writes past the end of the output
+                # buffer when executing the over-provisioned tail of a partial
+                # work-group, corrupting the host heap. Pad allocations so those
+                # stray stores land in valid memory.
+                allocator = _PaddedAllocator(allocator)
         else:
             # Check whether the passed allocator allocates SVM
             try:
diff --git a/test/test_pytato_arraycontext.py b/test/test_pytato_arraycontext.py
@@ -211,7 +211,15 @@ def twice(x):
                         allocator=alloc, use_memory_pool=use_memory_pool)
 
             from pyopencl.tools import ImmediateAllocator, MemoryPool
-            assert isinstance(actx.allocator,
+
+            from arraycontext.impl.pytato import _PaddedAllocator
+            alloc_to_check = actx.allocator
+            if isinstance(alloc_to_check, _PaddedAllocator):
+                # On the Intel CPU runtime the actx wraps its allocator to pad
+                # buffers (working around an out-of-bounds runtime store); check
+                # the wrapped allocator's type.
+                alloc_to_check = alloc_to_check._allocator
+            assert isinstance(alloc_to_check,
                               MemoryPool if use_memory_pool else ImmediateAllocator)
 
             f = actx.compile(twice)
@@ -398,6 +406,26 @@ def twice(x):
         actx2._enable_profiling(True)
 
 
+def _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit):
+    """Run :func:`loopy.auto_test_vs_ref`, padding buffers on Intel CPU CL.
+
+    That runtime writes out of bounds past kernel output buffers in partial
+    work-groups, corrupting the host heap; auto_test_vs_ref allocates its own
+    buffers, so pad them (via _PaddedAllocator) to absorb the stray stores.
+    """
+    import loopy as lp
+    import pyopencl as cl
+    from pyopencl.tools import ImmediateAllocator
+
+    alloc = ImmediateAllocator(cl.CommandQueue(cl_ctx))
+    device = cl_ctx.devices[0]
+    on_cpu = device.type & cl.device_type.CPU
+    if on_cpu and "intel" in device.platform.name.lower():
+        from arraycontext.impl.pytato import _PaddedAllocator
+        alloc = _PaddedAllocator(alloc)
+    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit, allocator=alloc)
+
+
 def test_parallelize_disjoint_loop_sets_scalar():
     import loopy as lp
     from loopy.kernel.data import GroupInameTag, LocalInameTag
@@ -489,7 +517,7 @@ def test_parallelize_disjoint_loop_sets_single_non_redn_iname():
         == {GroupInameTag(0)}
     assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 def test_parallelize_disjoint_loop_sets_multiple_non_redn_inames():
@@ -531,7 +559,7 @@ def test_parallelize_disjoint_loop_sets_multiple_non_redn_inames():
         == {LocalInameTag(0)}
     assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 def test_parallelize_disjoint_loop_sets_only_redn_iname():
@@ -572,7 +600,7 @@ def test_parallelize_disjoint_loop_sets_only_redn_iname():
         == {GroupInameTag(0)}
     assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 def test_parallelize_disjoint_loop_sets_mixed():
@@ -612,7 +640,7 @@ def test_parallelize_disjoint_loop_sets_mixed():
         == {LocalInameTag(0)}
     assert knl.iname_tags_of_type("k", (GroupInameTag, LocalInameTag)) == set()
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 def test_parallelize_disjoint_loop_sets_multiple_independent_loop_sets():
@@ -678,7 +706,7 @@ def test_parallelize_disjoint_loop_sets_multiple_independent_loop_sets():
                  and insn.synchronization_kind == "global"]
     assert len(gbarriers) == 1
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 def test_parallelize_disjoint_loop_sets_multiple_dependent_loop_sets():
@@ -748,7 +776,7 @@ def test_parallelize_disjoint_loop_sets_multiple_dependent_loop_sets():
     assert gbarrier.id in knl.id_to_insn["loopset2insn1"].depends_on
     assert gbarrier.id in knl.id_to_insn["loopset2insn2"].depends_on
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 def test_alias_global_temporaries():
@@ -807,7 +835,7 @@ def global_temp(name: str):
     assert base_storages["tmp2"] != base_storages["tmp1"]
     assert len(set(base_storages.values())) == 2
 
-    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
+    _auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
 
 
 if __name__ == "__main__":