try turning off SIMD

majosm · majosm · commit b67c5869e8c6 · 2026-05-29T16:48:54.000-05:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -143,6 +143,14 @@ jobs:
                 sudo apt-get update && sudo apt-get install -y valgrind
                 export PYTHONMALLOC=malloc
                 export NO_COLOR=1
+
+                # Experiment: set the Intel CPU runtime's vectorizer mode to 1 (no
+                # vectorization) to see whether that avoids the out-of-bounds tail-lane
+                # store at its source. Hypothesis: stores from over-provisioned work-items
+                # that the bounds guard masks off leak through only in vectorized code,
+                # so scalar code should honor the guard. If valgrind then reports no
+                # "Invalid write" (with no padding), this is a clean global fix.
+                export CL_CONFIG_CPU_VECTORIZER_MODE=1
                 valgrind \
                     --smc-check=all-non-file \
                     --leak-check=no --errors-for-leak-kinds=none \
diff --git a/intel_crash_reproducer.py b/intel_crash_reproducer.py
@@ -41,19 +41,9 @@
 print()
 print(lp.generate_code_v2(knl).device_code())
 
-# Execute the kernel. Allocate the output through the array context's padding
-# allocator, which over-allocates buffers to absorb the Intel CPU runtime's
-# out-of-bounds tail-lane stores. Under valgrind this should turn the previous
-# "Invalid write ... 0 bytes after a block" into a write that lands inside the
-# (padded) block.
-from pyopencl.tools import ImmediateAllocator
-
-from arraycontext.impl.pytato import _PaddedAllocator
-
-
+# Execute the kernel.
 ctx = cl.create_some_context(interactive=False)
 queue = cl.CommandQueue(ctx)
-allocator = _PaddedAllocator(ImmediateAllocator(queue))
 
-_evt, (out,) = knl(queue, allocator=allocator)
+_evt, (out,) = knl(queue)
 print(out.get())