[Pallas] Lower aten gather using one_hot + sum for TPU compatibility, unblocking cross_entropy

AmesingFlank · AmesingFlank · commit 55f875b936b3 · 2026-06-04T01:05:48.000Z
TPU Mosaic has very limited lax.gather support, so jnp.take_along_axis fails during lowering. Instead, implement gather(input, dim, index) as: `mask = one_hot(index.squeeze(dim), input.shape[dim], dtype=input.dtype)`; `result = sum(input * mask, axis=dim, keepdims=True)`. Also removes the xfailIfPallas mark from test_cross_entropy since the gather lowering now works. Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com> stack-info: PR: #2060, branch: AmesingFlank/stack/26
diff --git a/helion/_compiler/aten_lowering.py b/helion/_compiler/aten_lowering.py
@@ -2511,6 +2511,81 @@ def codegen_gather(ctx: LoweringContext, node: Node) -> object:
     return expr_from_string(result_var)
 
 
+@gather_lowering.register_codegen("pallas")
+def codegen_gather_pallas(ctx: LoweringContext, node: Node) -> object:
+    """Generate gather for Pallas using one_hot + multiply + sum.
+
+    TPU Mosaic has limited lax.gather support, so gather(input, dim, index) is:
+        mask = one_hot(index.squeeze(dim), input.shape[dim], dtype=input.dtype)
+        result = sum(input * mask, axis=dim, keepdims=True)
+    Because of the squeeze, index must have size 1 along ``dim``.
+    """
+    assert not node.kwargs, "gather does not support keyword arguments"
+    assert len(node.args) == 3, f"gather expects 3 arguments, got {len(node.args)}"
+
+    input_node = node.args[0]
+    dim = node.args[1]
+    index_node = node.args[2]
+
+    assert isinstance(input_node, Node), "gather input must be a Node"
+    assert isinstance(dim, int), f"gather dim must be int, got {type(dim)}"
+    assert isinstance(index_node, Node), "gather index must be a Node"
+
+    input_tensor = input_node.meta["val"]
+    assert isinstance(input_tensor, torch.Tensor), (
+        f"gather input must be a tensor, got {type(input_tensor)}"
+    )
+
+    ndim = input_tensor.ndim
+    if dim < 0:  # normalize a negative dim so it can be emitted as a literal
+        dim = ndim + dim
+    assert 0 <= dim < ndim, (
+        f"gather dim {dim} out of range for tensor with {ndim} dimensions"
+    )
+
+    fn = ctx.cg.device_function
+
+    input_ast = _env_arg(ctx, input_node)
+    assert isinstance(input_ast, ast.AST)
+
+    index_ast = _env_arg(ctx, index_node)
+    assert isinstance(index_ast, ast.AST)
+
+    idx_var = fn.new_var("gather_idx")
+    mask_var = fn.new_var("gather_mask")
+    result_var = fn.new_var("gather_result")
+
+    ctx.cg.add_statement(  # drop the size-1 `dim` axis from index, as int32
+        statement_from_string(
+            f"{idx_var} = jnp.squeeze({{index}}.astype(jnp.int32), axis={dim})",
+            index=index_ast,
+        )
+    )
+
+    ctx.cg.add_statement(  # one_hot appends the one-hot axis as the last axis
+        statement_from_string(
+            f"{mask_var} = jax.nn.one_hot({idx_var}, {{input}}.shape[{dim}], dtype={{input}}.dtype)",
+            input=input_ast,
+        )
+    )
+
+    if dim != ndim - 1:  # move the one-hot axis from last to `dim` to match input
+        ctx.cg.add_statement(
+            statement_from_string(
+                f"{mask_var} = jnp.moveaxis({mask_var}, -1, {dim})",
+            )
+        )
+
+    ctx.cg.add_statement(  # NOTE(review): 0 * inf is NaN, so inf/NaN input -> NaN
+        statement_from_string(
+            f"{result_var} = jnp.sum({{input}} * {mask_var}, axis={dim}, keepdims=True)",
+            input=input_ast,
+        )
+    )
+
+    return expr_from_string(result_var)
+
+
 @gather_lowering.register_codegen("cute")
 def codegen_gather_cute(ctx: LoweringContext, node: Node) -> object:
     assert not node.kwargs, "gather does not support keyword arguments"
diff --git a/test/test_pallas.py b/test/test_pallas.py
@@ -3466,6 +3466,101 @@ def k(x: torch.Tensor) -> torch.Tensor:
         inner_min = spec.block_sizes[1].min_size
         self.assertGreaterEqual(outer_min, inner_min)
 
+    def test_gather_2d_dim_1(self) -> None:  # gather along dim 1 of a 2-D input
+        @helion.kernel(
+            backend="pallas",
+            static_shapes=True,
+            ignore_warnings=[helion.exc.TensorOperationInWrapper],
+        )
+        def fn(x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+            n, _v = x.shape
+            out = torch.zeros([n, 1], dtype=x.dtype, device=x.device)
+            for tile_n in hl.tile(n):  # tile dim 0; dim 1 is taken whole
+                out[tile_n, :] = x[tile_n, :].gather(1, idx[tile_n, :])
+            return out
+
+        x = torch.randn(64, 256, device=DEVICE, dtype=torch.float32)
+        idx = torch.randint(0, 256, (64, 1), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(fn, (x, idx), block_size=64)
+        expected = x.gather(1, idx.long())  # eager reference with an int64 index
+        torch.testing.assert_close(result, expected)
+
+    def test_gather_2d_dim_0(self) -> None:  # gather along dim 0 of a 2-D input
+        @helion.kernel(
+            backend="pallas",
+            static_shapes=True,
+            ignore_warnings=[helion.exc.TensorOperationInWrapper],
+        )
+        def fn(x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+            _n, m = x.shape
+            out = torch.zeros([1, m], dtype=x.dtype, device=x.device)
+            for tile_m in hl.tile(m):  # tile dim 1; dim 0 is taken whole
+                out[:, tile_m] = x[:, tile_m].gather(0, idx[:, tile_m])
+            return out
+
+        x = torch.randn(128, 64, device=DEVICE, dtype=torch.float32)
+        idx = torch.randint(0, 128, (1, 64), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(fn, (x, idx), block_size=64)
+        expected = x.gather(0, idx.long())  # eager reference with an int64 index
+        torch.testing.assert_close(result, expected)
+
+    def test_gather_3d_dim_0(self) -> None:  # gather along dim 0 of a 3-D input
+        @helion.kernel(
+            backend="pallas",
+            static_shapes=True,
+            ignore_warnings=[helion.exc.TensorOperationInWrapper],
+        )
+        def fn(x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+            _n, m, k = x.shape
+            out = torch.zeros([1, m, k], dtype=x.dtype, device=x.device)
+            for tile_m in hl.tile(m):  # tile dim 1; dims 0 and 2 are taken whole
+                out[:, tile_m, :] = x[:, tile_m, :].gather(0, idx[:, tile_m, :])
+            return out
+
+        x = torch.randn(32, 16, 8, device=DEVICE, dtype=torch.float32)
+        idx = torch.randint(0, 32, (1, 16, 8), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(fn, (x, idx), block_size=16)
+        expected = x.gather(0, idx.long())  # eager reference with an int64 index
+        torch.testing.assert_close(result, expected)
+
+    def test_gather_3d_dim_1(self) -> None:  # gather along dim 1 of a 3-D input
+        @helion.kernel(
+            backend="pallas",
+            static_shapes=True,
+            ignore_warnings=[helion.exc.TensorOperationInWrapper],
+        )
+        def fn(x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+            n, _m, k = x.shape
+            out = torch.zeros([n, 1, k], dtype=x.dtype, device=x.device)
+            for tile_n in hl.tile(n):  # tile dim 0; dims 1 and 2 are taken whole
+                out[tile_n, :, :] = x[tile_n, :, :].gather(1, idx[tile_n, :, :])
+            return out
+
+        x = torch.randn(16, 32, 8, device=DEVICE, dtype=torch.float32)
+        idx = torch.randint(0, 32, (16, 1, 8), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(fn, (x, idx), block_size=16)
+        expected = x.gather(1, idx.long())  # eager reference with an int64 index
+        torch.testing.assert_close(result, expected)
+
+    def test_gather_3d_dim_2(self) -> None:  # gather along dim 2 of a 3-D input
+        @helion.kernel(
+            backend="pallas",
+            static_shapes=True,
+            ignore_warnings=[helion.exc.TensorOperationInWrapper],
+        )
+        def fn(x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+            n, m, _k = x.shape
+            out = torch.zeros([n, m, 1], dtype=x.dtype, device=x.device)
+            for tile_n in hl.tile(n):  # tile dim 0; dims 1 and 2 are taken whole
+                out[tile_n, :, :] = x[tile_n, :, :].gather(2, idx[tile_n, :, :])
+            return out
+
+        x = torch.randn(16, 8, 64, device=DEVICE, dtype=torch.float32)
+        idx = torch.randint(0, 64, (16, 8, 1), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(fn, (x, idx), block_size=16)
+        expected = x.gather(2, idx.long())  # eager reference with an int64 index
+        torch.testing.assert_close(result, expected)
+
 
 @skipUnlessPallas("JAX/Pallas TPU not available")
 class TestPallasIndirectGather(TestCase):