Try working around Intel OpenCL heap corruption by parallelizing only one iname

majosm · majosm · commit edb2253051d1 · 2026-05-29T15:13:42.000-05:00
diff --git a/arraycontext/impl/pytato/__init__.py b/arraycontext/impl/pytato/__init__.py
@@ -990,8 +990,14 @@ def _parallelize_across_device(
             parallelize_disjoint_loop_sets,
         )
 
+        dev = self.queue.device
+        # The Intel CPU OpenCL runtime corrupts the host heap on some kernels
+        # produced by the default parallelization. NOTE: this matches on the
+        # platform name only, so any Intel platform (not just CPU) is affected.
+        is_intel_cl = "intel" in dev.platform.name.lower()
+
         t_unit = parallelize_disjoint_loop_sets(
-            t_unit, self.queue.device.max_compute_units)
+            t_unit, dev.max_compute_units, is_intel_cl=is_intel_cl)
 
         # FIXME: Is this something that this abstract-ish
         # PytatoParallelPyOpenCLArrayContext class should be calling, or should it
diff --git a/arraycontext/impl/pytato/parallelize.py b/arraycontext/impl/pytato/parallelize.py
@@ -219,7 +219,8 @@ def split_loop_set_across_work_items(
         callables: CallablesTable,
         loop_set: LoopSet,
         iname_to_approx_length: Mapping[str, float | int],
-        max_device_compute_units: int,
+        max_device_compute_units: int, *,
+        is_intel_cl: bool = False,
 ) -> lp.LoopKernel:
     # Could possibly do something fancier that also includes the individual inner
     # loops in the loop set, but for now just looking at the inames shared between
@@ -260,6 +261,14 @@ def split_loop_set_across_work_items(
                     iname_to_approx_length[iname],
                     -outer_iname_pos[iname])))
 
+    if is_intel_cl:
+        # The Intel CPU OpenCL runtime corrupts the host heap on the 2D-tiled
+        # kernels produced when parallelizing two inames (a work-group axis plus
+        # two work-item axes). Keep only the largest loop (a non-reduction iname
+        # whenever one is present) so we emit the 1D (g.0 + l.0) parallelization,
+        # which the runtime handles correctly.
+        inames_to_parallelize = inames_to_parallelize[-1:]
+
     vng = kernel.get_var_name_generator()
 
     if len(inames_to_parallelize) == 0:
@@ -454,7 +463,8 @@ def split_iteration_domain_across_work_items_for_single_kernel(
         kernel: lp.LoopKernel,
         callables: CallablesTable,
         max_device_compute_units: int, *,
-        single_launch_config: bool = False) -> lp.LoopKernel:
+        single_launch_config: bool = False,
+        is_intel_cl: bool = False) -> lp.LoopKernel:
     if single_launch_config:
         raise NotImplementedError("single_launch_config==True isn't implemented yet.")
 
@@ -467,15 +477,16 @@ def split_iteration_domain_across_work_items_for_single_kernel(
     for loop_set in loop_sets:
         kernel = split_loop_set_across_work_items(
             kernel, callables, loop_set, iname_to_approx_length,
-            max_device_compute_units)
+            max_device_compute_units, is_intel_cl=is_intel_cl)
 
     return kernel
 
 
 def split_iteration_domain_across_work_items(
         t_unit: lp.TranslationUnit,
         max_device_compute_units: int, *,
-        single_launch_config: bool = False) -> lp.TranslationUnit:
+        single_launch_config: bool = False,
+        is_intel_cl: bool = False) -> lp.TranslationUnit:
     """
     Tag inames in *t_unit* with work-group/work-item axes so that each disjoint
     loop set is parallelized across the device. Loops are split based on their
@@ -486,7 +497,8 @@ def split_iteration_domain_across_work_items(
     return split_iteration_domain_across_work_items_for_single_kernel(
         t_unit, t_unit.callables_table,
         max_device_compute_units=max_device_compute_units,
-        single_launch_config=single_launch_config)
+        single_launch_config=single_launch_config,
+        is_intel_cl=is_intel_cl)
 
 # }}}
 
@@ -604,14 +616,15 @@ def add_gbarrier_between_disjoint_loop_sets(
 
 def parallelize_disjoint_loop_sets(
         t_unit: lp.TranslationUnit,
-        max_device_compute_units: int) -> lp.TranslationUnit:
+        max_device_compute_units: int, *,
+        is_intel_cl: bool = False) -> lp.TranslationUnit:
     """
     Parallelize *t_unit* by tagging the inames of each disjoint loop set with
     work-group and work-item axes and enforcing ordering between dependent
     loop sets.
     """
     t_unit = split_iteration_domain_across_work_items(
-        t_unit, max_device_compute_units)
+        t_unit, max_device_compute_units, is_intel_cl=is_intel_cl)
     t_unit = add_gbarrier_between_disjoint_loop_sets(t_unit)
     return t_unit