Revert "try tweaking parallelization on intel"

majosm · majosm · commit 92876d0fb16d · 2026-05-29T15:32:13.000-05:00
This reverts commit edb2253.
diff --git a/arraycontext/impl/pytato/__init__.py b/arraycontext/impl/pytato/__init__.py
@@ -990,14 +990,8 @@ def _parallelize_across_device(
             parallelize_disjoint_loop_sets,
         )
 
-        dev = self.queue.device
-        # The Intel CPU OpenCL runtime corrupts the host heap on some kernels
-        # produced by the default parallelization; detect it here so the
-        # parallelization can be tweaked to work around the issue.
-        is_intel_cl = "intel" in dev.platform.name.lower()
-
         t_unit = parallelize_disjoint_loop_sets(
-            t_unit, dev.max_compute_units, is_intel_cl=is_intel_cl)
+            t_unit, self.queue.device.max_compute_units)
 
         # FIXME: Is this something that this abstract-ish
         # PytatoParallelPyOpenCLArrayContext class should be calling, or should it
diff --git a/arraycontext/impl/pytato/parallelize.py b/arraycontext/impl/pytato/parallelize.py
@@ -219,8 +219,7 @@ def split_loop_set_across_work_items(
         callables: CallablesTable,
         loop_set: LoopSet,
         iname_to_approx_length: Mapping[str, float | int],
-        max_device_compute_units: int, *,
-        is_intel_cl: bool = False,
+        max_device_compute_units: int,
 ) -> lp.LoopKernel:
     # Could possibly do something fancier that also includes the individual inner
     # loops in the loop set, but for now just looking at the inames shared between
@@ -261,14 +260,6 @@ def split_loop_set_across_work_items(
                     iname_to_approx_length[iname],
                     -outer_iname_pos[iname])))
 
-    if is_intel_cl:
-        # The Intel CPU OpenCL runtime corrupts the host heap on the 2D-tiled
-        # kernels produced when parallelizing two inames (a work-group axis plus
-        # two work-item axes). Keep only the largest loop (a non-reduction iname
-        # whenever one is present) so we emit the 1D (g.0 + l.0) parallelization,
-        # which the runtime handles correctly.
-        inames_to_parallelize = inames_to_parallelize[-1:]
-
     vng = kernel.get_var_name_generator()
 
     if len(inames_to_parallelize) == 0:
@@ -463,8 +454,7 @@ def split_iteration_domain_across_work_items_for_single_kernel(
         kernel: lp.LoopKernel,
         callables: CallablesTable,
         max_device_compute_units: int, *,
-        single_launch_config: bool = False,
-        is_intel_cl: bool = False) -> lp.LoopKernel:
+        single_launch_config: bool = False) -> lp.LoopKernel:
     if single_launch_config:
         raise NotImplementedError("single_launch_config==True isn't implemented yet.")
 
@@ -477,16 +467,15 @@ def split_iteration_domain_across_work_items_for_single_kernel(
     for loop_set in loop_sets:
         kernel = split_loop_set_across_work_items(
             kernel, callables, loop_set, iname_to_approx_length,
-            max_device_compute_units, is_intel_cl=is_intel_cl)
+            max_device_compute_units)
 
     return kernel
 
 
 def split_iteration_domain_across_work_items(
         t_unit: lp.TranslationUnit,
         max_device_compute_units: int, *,
-        single_launch_config: bool = False,
-        is_intel_cl: bool = False) -> lp.TranslationUnit:
+        single_launch_config: bool = False) -> lp.TranslationUnit:
     """
     Tag inames in *t_unit* with work-group/work-item axes so that each disjoint
     loop set is parallelized across the device. Loops are split based on their
@@ -497,8 +486,7 @@ def split_iteration_domain_across_work_items(
     return split_iteration_domain_across_work_items_for_single_kernel(
         t_unit, t_unit.callables_table,
         max_device_compute_units=max_device_compute_units,
-        single_launch_config=single_launch_config,
-        is_intel_cl=is_intel_cl)
+        single_launch_config=single_launch_config)
 
 # }}}
 
@@ -616,15 +604,14 @@ def add_gbarrier_between_disjoint_loop_sets(
 
 def parallelize_disjoint_loop_sets(
         t_unit: lp.TranslationUnit,
-        max_device_compute_units: int, *,
-        is_intel_cl: bool = False) -> lp.TranslationUnit:
+        max_device_compute_units: int) -> lp.TranslationUnit:
     """
     Parallelize *t_unit* by tagging the inames of each disjoint loop set with
     work-group and work-item axes and enforcing ordering between dependent
     loop sets.
     """
     t_unit = split_iteration_domain_across_work_items(
-        t_unit, max_device_compute_units, is_intel_cl=is_intel_cl)
+        t_unit, max_device_compute_units)
     t_unit = add_gbarrier_between_disjoint_loop_sets(t_unit)
     return t_unit