We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 799a8ab commit 5ade61c — Copy full SHA for 5ade61c
3 files changed
changelog.d/fix-rewrite-pattern.md
@@ -0,0 +1,4 @@
1
+<!--- SPDX-FileCopyrightText: Copyright (c) <2025> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
2
+<!--- SPDX-License-Identifier: Apache-2.0 -->
3
+
4
+- Fixed a bug where a pattern match attempted to remove a value that is used by the new operation
src/cuda/tile/_passes/rewrite_patterns.py
@@ -150,6 +150,11 @@ def rewrite_patterns(root_block: Block):
150
# External use -- can't rewrite
151
continue
152
153
+ new_inputs = set(v.name for op in r.to_add for v in op.all_inputs())
154
+ if deleted_results & new_inputs:
155
+ # New operations use deleted results -- can't rewrite
156
+ continue
157
158
# For now, we insert the new operations at the location of the last matched op.
159
# This is not always correct for maintaining topological sorting, in case if matches
160
# have multiple outputs. However, currently we only care about rewriting subgraphs
test/test_fma.py
@@ -56,6 +56,28 @@ def add_mul_kernel(x, y, z, output,
56
ct.store(output, index=(bidx, 0), tile=output_tile)
57
58
59
+@ct.kernel
60
+def mul_add_same_operand_kernel(x, output,
61
+ TILE: ct.Constant[int],
62
+ DIM: ct.Constant[int]):
63
+ bidx = ct.bid(0)
64
+ tx = ct.load(x, index=(bidx, 0), shape=(TILE, DIM))
65
+ tmp = tx * tx
66
+ output_tile = tmp + tmp
67
+ ct.store(output, index=(bidx, 0), tile=output_tile)
68
69
70
+def test_fma_skip_when_new_op_uses_deleted_var():
71
+ shape = (128, 32)
72
+ x = make_tensor(shape, dtype=torch.float32, device='cuda')
73
+ output = make_tensor(shape, dtype=torch.float32, device='cuda')
74
+ TILE = 32
75
+ grid = (ceil(shape[0] / TILE), 1, 1)
76
+ ct.launch(torch.cuda.current_stream(), grid, mul_add_same_operand_kernel,
77
+ (x, output, TILE, shape[1]))
78
+ assert_close(output, 2 * x * x, atol=1e-3, rtol=1e-3)
79
80
81
@pytest.mark.use_mlir
82
@pytest.mark.parametrize(
83
"kernel, kernel_ref",
0 commit comments