[Pallas] Indirect gather via one-hot matmul codegen

thcmbs · thcmbs · commit eb0009411564 · 2026-04-17T09:41:52.000Z
diff --git a/helion/_compiler/pallas/plan_tiling.py b/helion/_compiler/pallas/plan_tiling.py
@@ -64,6 +64,17 @@ class NonePattern(IndexingPattern):
     """None index pattern (broadcasting dimension) - allow tiling."""
 
 
+@dataclass
+class IndirectGatherPattern(IndexingPattern):
+    """Pattern for table[idx_tensor, :] where idx_tensor is a runtime tensor.
+
+    Codegen emits one_hot(idx, V) @ table. The table's first dim gets a None
+    BlockSpec (entire table in VMEM, no tiling on that dim).
+    """
+
+    idx_block_id: int | None = None  # block id of idx_tensor's dim 0, if any
+
+
 @dataclass
 class DimensionTiling:
     """Tiling decision for a specific dimension of a tensor
@@ -183,6 +194,11 @@ def _detect_indexing_pattern(
 
     if isinstance(idx, torch.fx.Node):
         idx_val = idx.meta.get("val")
+        if isinstance(idx_val, torch.Tensor):
+            idx_block_id: int | None = None
+            if idx_val.ndim >= 1:
+                idx_block_id = env.get_block_id(idx_val.shape[0])
+            return IndirectGatherPattern(idx_block_id=idx_block_id)
         if isinstance(idx_val, torch.SymInt):
             block_id = env.get_block_id(idx_val)
             if block_id is not None:
@@ -270,6 +286,9 @@ def _try_set_tiling_block_id(new_block_id: int) -> None:
     elif isinstance(pattern, NonePattern):
         pass
 
+    elif isinstance(pattern, IndirectGatherPattern):
+        _disallow_tiling()
+
     if isinstance(pattern, (TilePattern, TileBeginWithOffsetPattern)):
         block_size = env.block_sizes[pattern.block_id].from_config(config)
         if isinstance(block_size, int):
diff --git a/helion/language/memory_ops.py b/helion/language/memory_ops.py
@@ -304,6 +304,102 @@ def _pallas_generated_index_code(
     )
 
 
+# Conservative VMEM threshold for gather tables: exceeding it raises a clear
+# error instead of a generic Mosaic OOM. TODO: replace with context-aware VMEM
+# budget accounting (e.g. querying actual capacity and other allocations).
+_PALLAS_GATHER_VMEM_THRESHOLD_BYTES = 16 << 20  # 16 MiB
+
+
+def _pallas_indirect_gather_positions(
+    indexing_patterns: list[object],
+) -> list[int]:
+    """Return the positions in ``indexing_patterns`` that are indirect gathers."""
+    from .._compiler.pallas.plan_tiling import IndirectGatherPattern
+
+    def _is_gather(entry: object) -> bool:
+        return isinstance(entry, IndirectGatherPattern)
+
+    return [pos for pos, entry in enumerate(indexing_patterns) if _is_gather(entry)]
+
+
+def _pallas_emit_gather_load(
+    state: CodegenState,
+    tensor: torch.Tensor,
+    subscript: list[object] | tuple[object, ...],
+    indexing_patterns: list[object],
+    indirect_positions: list[int],
+    name: str,
+) -> ast.AST:
+    """Emit a one-hot matmul gather: one_hot(idx, V) @ table.
+
+    Raises NotImplementedError for unsupported layouts, dtypes or table sizes.
+    """
+    from .._compiler.pallas.plan_tiling import IndirectGatherPattern
+
+    if len(indirect_positions) > 1:
+        raise NotImplementedError(
+            "Pallas backend: gather with multiple indirect dims is not supported"
+        )
+    indirect_pos = indirect_positions[0]
+    if indirect_pos != 0:
+        raise NotImplementedError(
+            "Pallas backend: indirect gather is only supported on dim 0"
+        )
+    pattern = indexing_patterns[indirect_pos]
+    assert isinstance(pattern, IndirectGatherPattern)
+
+    table_bytes = tensor.numel() * tensor.dtype.itemsize
+    if (
+        isinstance(table_bytes, int)
+        and table_bytes > _PALLAS_GATHER_VMEM_THRESHOLD_BYTES
+    ):
+        raise NotImplementedError(
+            f"Pallas backend: indirect gather requires the full table in VMEM "
+            f"({table_bytes} bytes > {_PALLAS_GATHER_VMEM_THRESHOLD_BYTES} byte "
+            f"threshold). Tile the kernel so the gathered table fits, or use a "
+            f"different access pattern."
+        )
+
+    if not tensor.dtype.is_floating_point:
+        raise NotImplementedError(
+            f"Pallas backend: indirect gather requires a floating-point table, "
+            f"got {tensor.dtype}"
+        )
+
+    vocab_size = tensor.shape[0]
+
+    ast_subscripts = state.ast_args[1]
+    assert isinstance(ast_subscripts, list)
+    ast_idx = ast_subscripts[indirect_pos]
+    assert isinstance(ast_idx, ast.AST)
+    idx_name = state.codegen.lift(ast_idx, dce=False, prefix="index").id
+
+    # None subscript positions become expand_dims axes after the matmul
+    none_dims = [pos for pos, idx in enumerate(subscript) if idx is None]
+
+    jnp_dtype = CompileEnvironment.current().backend.dtype_str(tensor.dtype)
+    # TPU MXU requires 32-bit accumulator. Default matmul precision may round
+    # the float32 operands to bfloat16, which is lossless only for bfloat16
+    # tables; float16/float32 carry more mantissa bits and need HIGHEST.
+    needs_highest = tensor.dtype != torch.bfloat16
+    precision_arg = "precision=jax.lax.Precision.HIGHEST, " if needs_highest else ""
+    result = expr_from_string(
+        f"jax.lax.dot_general("
+        f"jax.nn.one_hot({idx_name}[...], {vocab_size}, dtype=jnp.float32), "
+        f"{name}[...].astype(jnp.float32), "
+        f"(((1,), (0,)), ((), ())), "
+        f"preferred_element_type=jnp.float32, "
+        f"{precision_arg}"
+        f").astype({jnp_dtype})"
+    )
+
+    for dim in none_dims:
+        result = expr_from_string(
+            f"jnp.expand_dims({{result}}, axis={dim})", result=result
+        )
+    return result
+
+
 def _pallas_tile_pattern_code(
     pattern: object,
     idx: object,
@@ -448,6 +544,12 @@ def _(state: CodegenState) -> None:
     device_fn = state.device_function
     device_fn.device_store_index += 1
     device_fn.device_memory_op_index += 1
+    indexing_patterns = _pallas_get_indexing_patterns(state, tensor)
+    if _pallas_indirect_gather_positions(indexing_patterns):
+        # TODO(pallas-scatter): emit one_hot(idx, V).T @ values
+        raise NotImplementedError(
+            "Pallas backend: indirect store (scatter) is not supported"
+        )
     index_str, _ = _pallas_index_str(state, subscript, tensor)
     state.codegen.add_statement(
         statement_from_string(f"{name}[{index_str}] = {{value}}", value=value)
@@ -1507,6 +1609,14 @@ def _(state: CodegenState) -> ast.AST:
     device_fn = state.device_function
     device_fn.device_load_index += 1
     device_fn.device_memory_op_index += 1
+
+    indexing_patterns = _pallas_get_indexing_patterns(state, tensor)
+    indirect_positions = _pallas_indirect_gather_positions(indexing_patterns)
+    if indirect_positions:
+        return _pallas_emit_gather_load(
+            state, tensor, subscript, indexing_patterns, indirect_positions, name
+        )
+
     index_str, none_dims = _pallas_index_str(state, subscript, tensor)
     result = expr_from_string(f"{name}[{index_str}]")
     for dim in none_dims:
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -472,7 +472,6 @@ def test_softmax_two_pass_block_ptr(self):
             indexing="block_ptr",
         )
 
-    @xfailIfPallas("missing BlockSpec for hl.load with computed indices")
     def test_cross_entropy(self):
         n, v = 128, 1000
         logits = torch.randn(n, v, device=DEVICE, dtype=torch.float32)
@@ -673,7 +672,6 @@ def test_rms_norm_bwd(self):
             atol=1e-2,
         )
 
-    @xfailIfPallas("BlockSpec tiling failure")
     def test_embedding_pointers(self):
         args = (
             torch.randint(0, 1024, [8, 128], device=DEVICE, dtype=torch.int32),
@@ -687,7 +685,6 @@ def test_embedding_pointers(self):
             indexing="pointer",
         )
 
-    @xfailIfPallas("BlockSpec tiling failure")
     @patch.object(_compat, "_supports_tensor_descriptor", lambda: False)
     @skipIfTileIR("TileIR does not support block_ptr indexing")
     def test_embedding_block_ptr(self):
diff --git a/test/test_pallas.py b/test/test_pallas.py
@@ -904,6 +904,72 @@ def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         expected = (x[:, None] < y[None, :]).to(torch.float32)
         torch.testing.assert_close(result, expected)
 
+    @staticmethod
+    def _indirect_gather_kernel():  # shared by the indirect-gather tests below
+        @helion.kernel(backend="pallas", static_shapes=True)
+        def gather(indices: torch.Tensor, table: torch.Tensor) -> torch.Tensor:
+            out = torch.empty(
+                [indices.size(0), table.size(1)],
+                dtype=table.dtype,
+                device=table.device,
+            )
+            for tile_b, tile_e in hl.tile([indices.size(0), table.size(1)]):
+                out[tile_b, tile_e] = table[indices[tile_b], tile_e]
+            return out
+
+        return gather
+
+    def test_indirect_gather_fits_vmem(self) -> None:
+        """fp32 indirect gather lowers to a one_hot matmul with HIGHEST precision."""
+        gather = self._indirect_gather_kernel()
+        table = torch.randn(16, 64, device=DEVICE, dtype=torch.float32)
+        indices = torch.randint(0, 16, (256,), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(gather, (indices, table), block_sizes=[128, 64])
+        self.assertIn("one_hot", code)
+        self.assertIn("HIGHEST", code)
+        expected = table.cpu()[indices.long().cpu()]
+        torch.testing.assert_close(result.cpu(), expected)
+
+    def test_indirect_gather_bf16(self) -> None:
+        """Indirect gather with bf16 table skips HIGHEST precision."""
+        gather = self._indirect_gather_kernel()
+        table = torch.randn(16, 64, device=DEVICE, dtype=torch.bfloat16)
+        indices = torch.randint(0, 16, (256,), device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(gather, (indices, table), block_sizes=[128, 64])
+        self.assertIn("one_hot", code)
+        self.assertIn("astype(jnp.bfloat16)", code)
+        self.assertNotIn("HIGHEST", code)  # bf16 path omits the precision kwarg
+        expected = table.cpu()[indices.long().cpu()]
+        torch.testing.assert_close(result.cpu(), expected)
+
+    def test_indirect_gather_too_large_raises(self) -> None:
+        """Indirect gather table over the VMEM threshold raises NotImplementedError."""
+        gather = self._indirect_gather_kernel()
+        # 65537 * 64 * 4 bytes = 16 MiB + 256 bytes, just above the 16 MiB threshold.
+        table = torch.randn(65537, 64, device=DEVICE, dtype=torch.float32)
+        indices = torch.randint(0, 65537, (256,), device=DEVICE, dtype=torch.int32)
+        with self.assertRaisesRegex(
+            Exception, "indirect gather requires the full table"
+        ):
+            code_and_output(gather, (indices, table), block_sizes=[128, 64])
+
+    def test_indirect_store_scatter_raises(self) -> None:
+        """Scatter (indirect store) is rejected with a clear error."""
+
+        @helion.kernel(backend="pallas", static_shapes=True)
+        def scatter(
+            out: torch.Tensor, values: torch.Tensor, indices: torch.Tensor
+        ) -> torch.Tensor:
+            for tile_b, tile_e in hl.tile([values.size(0), values.size(1)]):
+                out[indices[tile_b], tile_e] = values[tile_b, tile_e]
+            return out
+
+        out = torch.zeros(16, 64, device=DEVICE, dtype=torch.float32)
+        values = torch.randn(8, 64, device=DEVICE, dtype=torch.float32)
+        indices = torch.arange(8, device=DEVICE, dtype=torch.int32)  # distinct rows
+        with self.assertRaisesRegex(Exception, "indirect store"):
+            code_and_output(scatter, (out, values, indices), block_sizes=[8, 64])
+
 
 if __name__ == "__main__":
     unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -472,7 +472,6 @@ def test_softmax_two_pass_block_ptr(self):`
`472`	`472`	`indexing="block_ptr",`
`473`	`473`	`)`
`474`	`474`
`475`		`- @xfailIfPallas("missing BlockSpec for hl.load with computed indices")`
`476`	`475`	`def test_cross_entropy(self):`
`477`	`476`	`n, v = 128, 1000`
`478`	`477`	`logits = torch.randn(n, v, device=DEVICE, dtype=torch.float32)`
`@@ -673,7 +672,6 @@ def test_rms_norm_bwd(self):`
`673`	`672`	`atol=1e-2,`
`674`	`673`	`)`
`675`	`674`
`676`		`- @xfailIfPallas("BlockSpec tiling failure")`
`677`	`675`	`def test_embedding_pointers(self):`
`678`	`676`	`args = (`
`679`	`677`	`torch.randint(0, 1024, [8, 128], device=DEVICE, dtype=torch.int32),`
`@@ -687,7 +685,6 @@ def test_embedding_pointers(self):`
`687`	`685`	`indexing="pointer",`
`688`	`686`	`)`
`689`	`687`
`690`		`- @xfailIfPallas("BlockSpec tiling failure")`
`691`	`688`	`@patch.object(_compat, "_supports_tensor_descriptor", lambda: False)`
`692`	`689`	`@skipIfTileIR("TileIR does not support block_ptr indexing")`
`693`	`690`	`def test_embedding_block_ptr(self):`