Fold gather(load(t, [..., :, ...]), dim, idx) into direct indirect load

AmesingFlank · AmesingFlank · commit b70396cc39d8 · 2026-06-04T01:37:01.000Z
The cross_entropy pattern (logits[tile_n, :].gather(1, idx[tile_n].unsqueeze(1))) was producing invalid Triton (NameError on the load) when the reduction roller tried to roll the surrounding amax/sum: a _for_loop output can't carry the rdim-shaped logits_rows out to feed the gather sitting outside the loop. Rewrite gather(load(t, [..., :, ...]), dim, idx) at the FX layer to a direct indirect load(t, [..., idx, ...]). The two forms compute the same values, but the direct form skips the wide load entirely — so the rdim-shaped intermediate never exists and the roller's existing logic handles the surrounding reductions naturally. The CuTe backend already does this fold at codegen time (aten_lowering.codegen_gather_cute); lifting it to FX surfaces the same simplification to the Triton backend and the rolling analysis. The fold is gated to the cross_entropy-style pattern: load's dim axis is a full slice, gather index has a singleton at dim and the same rank as the load's subscript, no extra_mask. Other gather shapes go through the existing aten.gather path. After this, examples/cross_entropy.py runs end-to-end: autotuning finds rolled configs (block_sizes=[1], reduction_loops=[16384]) and the kernel is ~3x faster than torch eager. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> stack-info: PR: #2684, branch: AmesingFlank/stack/63
diff --git a/helion/_compiler/device_ir.py b/helion/_compiler/device_ir.py
@@ -2392,6 +2392,8 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
 
         for graph in device_ir.graphs:
             rewrite_implicit_random_ops(graph.graph)
+        for graph in device_ir.graphs:
+            fold_gather_into_load(graph.graph)
         if CompileEnvironment.current().backend.name == "cute":
             promotions = collect_cute_half_atomic_output_promotions(device_ir.graphs)
             if promotions:
@@ -2626,6 +2628,104 @@ def remove_unnecessary_tile_index(graph: torch.fx.Graph) -> None:
             graph.erase_node(node)
 
 
+def fold_gather_into_load(graph: torch.fx.Graph) -> None:
+    """Rewrite ``gather(load(t, [..., :, ...]), dim, idx)`` to a direct
+    indirect ``load(t, [..., idx, ...])`` that picks the gathered elements
+    in one shot.  ``graph`` is mutated in place.
+
+    The direct form skips the full-axis load that the gather indexes into:
+
+    * the original load may be too wide to fit Triton's per-tile element
+      cap (the cross_entropy ``logits[tile_n, :]`` case), and
+    * if that axis is the reduction axis, the original load produces an
+      rdim-shaped value that the reduction roller can't carry out of its
+      ``_for_loop`` to feed the gather (the source of the cross_entropy
+      ``NameError: <load> is not defined`` codegen failure).
+
+    The CuTe backend applies the same fold at codegen time (see
+    ``aten_lowering.codegen_gather_cute``); doing it on the FX graph lets
+    the Triton backend and the rolling analysis see the rewritten graph.
+    The original load is kept while it has other users (e.g. a sibling
+    reduction over the same axis) and erased once it has none.
+
+    The fold only fires when:
+
+    * the load's ``dim`` axis is a full ``slice(None)`` (Helion's indirect
+      load pairs tensor indexers elementwise, not as a Cartesian product),
+    * the gather index has the subscript's rank, a singleton at ``dim``,
+      and (when the load's shape is known) the load's extent elsewhere,
+    * the load has no ``extra_mask`` and no keyword arguments.
+
+    Anything else is left to the existing ``aten.gather`` path.
+    """
+    for gather in graph.find_nodes(
+        op="call_function", target=torch.ops.aten.gather.default
+    ):
+        if gather.kwargs or len(gather.args) != 3:
+            continue
+        load_node, dim, index_node = gather.args
+        if not (
+            isinstance(load_node, torch.fx.Node)
+            and isinstance(index_node, torch.fx.Node)
+            and isinstance(dim, int)
+            and load_node.target is hl.load
+        ):
+            continue
+        # Load kwargs (e.g. a keyword ``extra_mask``) would slip past the
+        # positional checks below and be dropped by the rewrite.
+        if load_node.kwargs or len(load_node.args) < 2:
+            continue
+        tensor_node, subscript, *load_tail = load_node.args
+        if not isinstance(subscript, (list, tuple)):
+            continue
+        ndim = len(subscript)
+        if dim < 0:
+            dim += ndim
+        if not (0 <= dim < ndim):
+            continue
+        # The gathered axis must be a full slice or the fold changes results.
+        if not (isinstance(subscript[dim], slice) and subscript[dim] == slice(None)):
+            continue
+        # A non-None ``extra_mask`` is sized for the wide load and would not
+        # match the narrow result; ``eviction_policy`` is forwarded as-is.
+        if load_tail and load_tail[0] is not None:
+            continue
+        index_val = index_node.meta.get("val")
+        if not (isinstance(index_val, torch.Tensor) and index_val.ndim == ndim):
+            continue
+        size_hint = CompileEnvironment.current().size_hint
+        if size_hint(index_val.size(dim)) != 1:
+            continue
+        # gather may use an index smaller than its input off the gather axis;
+        # the fold keeps the load's indexers there, so require equal extents
+        # (only checkable when the load node records a tensor ``val``).
+        load_val = load_node.meta.get("val")
+        if isinstance(load_val, torch.Tensor) and (
+            load_val.ndim != ndim
+            or any(
+                size_hint(index_val.size(i)) != size_hint(load_val.size(i))
+                for i in range(ndim)
+                if i != dim
+            )
+        ):
+            continue
+        new_subscript = [
+            (index_node if i == dim else s) for i, s in enumerate(subscript)
+        ]
+        with graph.inserting_before(gather):
+            new_load = graph.call_function(
+                hl.load,
+                (tensor_node, new_subscript, *load_tail),
+                {},
+            )
+        # The new load yields the gather's shape/dtype, so reuse its meta.
+        new_load.meta.update(gather.meta)
+        gather.replace_all_uses_with(new_load)
+        graph.erase_node(gather)
+        if not load_node.users:
+            graph.erase_node(load_node)
+
+
 def collect_cute_half_atomic_output_promotions(
     graph_infos: list[GraphInfo],
 ) -> dict[str, torch.dtype]:
diff --git a/test/test_indexing.py b/test/test_indexing.py
@@ -2467,6 +2467,41 @@ def test_gather(
 
         torch.testing.assert_close(result, expected)
 
+    @skipIfTileIR("TileIR does not support gather operation")
+    def test_gather_with_rdim_reduction(self):
+        """torch.gather over the reduction dim of an implicitly rolled load.
+
+        Mirrors the cross_entropy pattern: ``logits[tile_n, :]`` feeds both
+        ``torch.gather(..., 1, idx)`` and ``torch.amax(..., dim=-1)``.  With
+        a rolled reduction, the gather (a non-reduction op consuming the
+        rdim) sits outside the loop, and Triton rejected the generated code
+        with ``NameError: <load> is not defined``.  The kernel must now
+        compile under the default config and match eager.  V is intended to
+        be large enough for that default to pick a rolled reduction.
+        NOTE(review): this text used to say a roller pre-pass refuses to
+        roll the kernel and falls back to a persistent reduction -- confirm.
+        """
+
+        @helion.kernel()
+        def gather_then_reduce(
+            x: torch.Tensor,  # [N, V]
+            idx: torch.Tensor,  # [N]
+        ) -> torch.Tensor:  # [N]
+            n, _v = x.shape
+            out = torch.empty([n], dtype=x.dtype, device=x.device)
+            for tile_n in hl.tile(n):
+                row = x[tile_n, :]
+                gathered = row.gather(1, idx[tile_n].unsqueeze(1)).squeeze(1)
+                out[tile_n] = torch.amax(row, dim=-1) - gathered
+            return out
+
+        n, v = 4, 8192
+        x = torch.randn(n, v, device=DEVICE, dtype=torch.float32)
+        idx = torch.randint(0, v, (n,), device=DEVICE, dtype=torch.int64)
+        _, result = code_and_output(gather_then_reduce, (x, idx))
+        expected = torch.amax(x, dim=-1) - x.gather(1, idx[:, None]).squeeze(1)
+        torch.testing.assert_close(result, expected)
+
     @skipIfTileIR("TileIR does not support gather operation")
     def test_gather_2d_dim0(self):
         @helion.kernel()