Add shared fusion infrastructure and QuantFusionPass (pytorch#19724)

ethansfng · web-flow · commit 4de16d0ad243 · 2026-05-29T02:43:41.000Z
Differential Revision: D105728137 Pull Request resolved: pytorch#19724
diff --git a/backends/cadence/aot/compiler_funcs.py b/backends/cadence/aot/compiler_funcs.py
@@ -14,6 +14,7 @@
 import torch
 from torch._inductor.decomposition import remove_decompositions
 from torch.fx import GraphModule
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
 from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e
 from torchao.quantization.pt2e.quantizer import Quantizer
 
@@ -607,3 +608,32 @@ def sink_input_dequant_through_transparent_ops(
         graph_module.recompile()
 
     return modified
+
+
+class QuantFusionPass(PassBase):
+    """
+    Iterates patterns, finds anchor ops in the converted graph, and calls
+    pattern.fuse() to replace dq-op-q subgraphs with fused ops.
+    """
+
+    def __init__(self, patterns: Sequence[object]) -> None:
+        super().__init__()
+        self.patterns = patterns
+
+    def call(self, graph_module: GraphModule) -> Optional[PassResult]:
+        changed = False
+        for pattern in self.patterns:
+            pattern_changed = False
+            for target in pattern.anchor_ops():  # pyre-ignore[16]
+                for node in graph_module.graph.find_nodes(
+                    op="call_function", target=target
+                ):
+                    result = pattern.fuse(graph_module, node)  # pyre-ignore[16]
+                    if result is not None:
+                        changed = True
+                        pattern_changed = True
+            if pattern_changed:
+                graph_module.graph.eliminate_dead_code()
+        if changed:
+            graph_module.recompile()
+        return PassResult(graph_module, changed)
diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py
@@ -212,3 +212,20 @@ def nodes_not_adjacent_in_gm(
 def none_throws(x: Optional[PassResult]) -> PassResult:
     assert x is not None
     return x
+
+
+def replace_with_op(
+    gm: torch.fx.GraphModule,
+    insert_after: torch.fx.Node,
+    replacement_op: torch._ops.OpOverload,
+    args: tuple,  # pyre-ignore[2]
+    kwargs: dict,  # pyre-ignore[2]
+    node_to_replace: torch.fx.Node,
+) -> torch.fx.Node:
+    """Insert ``replacement_op`` after ``insert_after`` and replace all uses of
+    ``node_to_replace`` with the new node."""
+    with gm.graph.inserting_after(insert_after):
+        new_node = gm.graph.call_function(replacement_op, args, kwargs)
+    new_node.meta = node_to_replace.meta
+    node_to_replace.replace_all_uses_with(new_node)
+    return new_node
diff --git a/backends/cadence/aot/quantizer/BUCK b/backends/cadence/aot/quantizer/BUCK
@@ -14,6 +14,21 @@ fbcode_target(_kind = runtime.python_library,
     ],
 )
 
+# Python library target for the shared fusion helpers in pattern_utils.py.
+# NOTE(review): the pattern_utils.py added in this change imports pass_utils,
+# quantizer/utils and aot/utils, but nothing from compiler_utils — confirm that
+# dependency is actually needed.
+fbcode_target(_kind = runtime.python_library,
+    name = "pattern_utils",
+    srcs = [
+        "pattern_utils.py",
+    ],
+    typing = True,
+    deps = [
+        ":utils",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler_utils",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:utils",
+    ],
+)
+
 fbcode_target(_kind = runtime.python_library,
     name = "patterns",
     srcs = [
diff --git a/backends/cadence/aot/quantizer/pattern_utils.py b/backends/cadence/aot/quantizer/pattern_utils.py
@@ -0,0 +1,207 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import operator
+from typing import Any
+
+import torch
+from executorch.backends.cadence.aot.pass_utils import get_arg, replace_with_op
+from executorch.backends.cadence.aot.quantizer.utils import (
+    copy_node_metadata,
+    create_zero_bias_int32,
+    quantize_tensor_multiplier,
+)
+from executorch.backends.cadence.aot.utils import is_depthwise_conv
+from torch import fx
+from torch._ops import OpOverload
+
+# Per-tensor dequantize / quantize overloads from the ``quantized_decomposed``
+# namespace. Node targets are compared against these to recognise the dq / q
+# ends of a fusable chain.
+DQ_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.dequantize_per_tensor.default
+Q_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.quantize_per_tensor.default
+
+
+def insert_node_with_meta(
+    gm: fx.GraphModule,
+    op: OpOverload,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any] | None,
+    insert_before: fx.Node,
+    like_node: fx.Node,
+) -> fx.Node:
+    """Create a new node and populate its FakeTensor metadata.
+
+    Inserts ``op(*args, **kwargs)`` before ``insert_before``, runs the op
+    under ``like_node``'s fake_mode to compute ``meta["val"]``, and copies
+    remaining metadata from ``like_node``.
+    """
+    with gm.graph.inserting_before(insert_before):
+        node = gm.graph.call_function(op, args, kwargs or {})
+    assert "val" in like_node.meta
+    fake_mode = like_node.meta["val"].fake_mode
+    assert fake_mode is not None
+
+    def _resolve(x: Any) -> Any:
+        return x.meta["val"] if isinstance(x, fx.Node) else x
+
+    fake_args = tuple(_resolve(a) for a in args)
+    fake_kwargs = {k: _resolve(v) for k, v in (kwargs or {}).items()}
+    with fake_mode:
+        node.meta["val"] = op(*fake_args, **fake_kwargs)
+    copy_node_metadata(node, like_node)
+    return node
+
+
+def find_quant_user(node: fx.Node) -> fx.Node | None:
+    """Find the first quantize_per_tensor user of ``node``, traversing through getitem."""
+    users = list(node.users)
+    if not users:
+        return None
+    user = users[0]
+    if user.target is operator.getitem:
+        if user.args[1] == 0:
+            users = list(user.users)
+            if not users:
+                return None
+            user = users[0]
+        else:
+            return None
+    if user.target == Q_PER_TENSOR:
+        return user
+    return None
+
+
+def fuse_conv(
+    pattern: object,
+    gm: fx.GraphModule,
+    conv_node: fx.Node,
+    dq_input: fx.Node,
+    dq_weight: fx.Node,
+    quant_node: fx.Node,
+) -> fx.Node:
+    """Fuse a dq->conv->q chain into a single quantized conv op.
+
+    Args:
+        pattern: Object providing ``replacement_op()``, the fused op to emit.
+        gm: Graph module being rewritten.
+        conv_node: The float conv node at the centre of the chain.
+        dq_input: Dequantize node feeding the conv input.
+        dq_weight: Dequantize node feeding the conv weight.
+        quant_node: Quantize node consuming the conv output; its users are
+            redirected to the fused node.
+
+    Returns:
+        The fused node, inserted after ``conv_node``.
+    """
+    # The bias is read positionally (third conv argument) and is only treated
+    # as quantized when it is produced by DQ_PER_TENSOR.
+    # NOTE(review): a bias node that is present but NOT a dequantize node leaves
+    # dq_bias as None, so a zero bias is substituted below and the original
+    # bias is not passed to the fused op — confirm callers rule this case out.
+    dq_bias = None
+    if len(conv_node.args) > 2 and conv_node.args[2] is not None:
+        bias_arg = conv_node.args[2]
+        assert isinstance(bias_arg, fx.Node)
+        dq_bias = bias_arg if bias_arg.target == DQ_PER_TENSOR else None
+    weight_scale = get_arg(dq_weight, "scale", float)
+    input_scale = get_arg(dq_input, "scale", float)
+    bias_scale = input_scale * weight_scale
+    if dq_bias is not None:
+        bias_q = get_arg(dq_bias, "input", fx.Node)
+    else:
+        # Cadence quantized conv ops require a non-optional bias argument.
+        weight_node = get_arg(dq_weight, "input", fx.Node)
+        with gm.graph.inserting_before(conv_node):
+            bias_q = create_zero_bias_int32(gm, weight_node, bias_scale)
+    # Ratio of the accumulator scale (input * weight) to the output scale,
+    # converted to a multiplier/shift pair by quantize_tensor_multiplier.
+    requantize_scale = bias_scale / get_arg(quant_node, "scale", float)
+    requantize_scale_t = torch.tensor([requantize_scale])
+    out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t)
+    # The fused op consumes the already-quantized tensors that feed the
+    # dequantize nodes, not the dequantized values.
+    args = (
+        get_arg(dq_input, "input", fx.Node),
+        get_arg(dq_weight, "input", fx.Node),
+        bias_q,
+    )
+    groups = get_arg(conv_node, "groups", int)
+    kwargs = {
+        "stride": get_arg(conv_node, "stride", list[int]),
+        "padding": get_arg(conv_node, "padding", list[int]),
+        "dilation": get_arg(conv_node, "dilation", list[int]),
+        "groups": groups,
+        "input_zero_point": get_arg(dq_input, "zero_point", int),
+        "weight_zero_point": get_arg(dq_weight, "zero_point", int),
+        "bias_scale": bias_scale,
+        "out_scale": get_arg(quant_node, "scale", float),
+        "out_zero_point": get_arg(quant_node, "zero_point", int),
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+    }
+    replacement_op = pattern.replacement_op()  # pyre-ignore[16]
+    # For the 1D NCL conv, switch to the depthwise variant when
+    # is_depthwise_conv says so. Dim 1 of the input's fake value is taken as
+    # the channel count (presumably matching the NCL layout in the op name).
+    if replacement_op == torch.ops.cadence.quantized_conv1d_ncl.per_tensor:
+        input_node = get_arg(dq_input, "input", fx.Node)
+        assert len(input_node.meta["val"].shape) >= 2
+        in_channels = input_node.meta["val"].shape[1]
+        if is_depthwise_conv(groups, in_channels):
+            replacement_op = torch.ops.cadence.quantized_depthwise_conv1d_ncl.per_tensor
+    return replace_with_op(gm, conv_node, replacement_op, args, kwargs, quant_node)
+
+
+def fuse_linear(
+    gm: fx.GraphModule,
+    dq_input: fx.Node,
+    dq_weight: fx.Node,
+    dq_bias: fx.Node | None,
+    quant_node: fx.Node,
+    op_node: fx.Node,
+    replacement_op: OpOverload,
+    weight_q: fx.Node | None = None,
+) -> fx.Node:
+    """Fuse a dq->linear->q chain into a single quantized linear op.
+
+    Args:
+        gm: Graph module being rewritten.
+        dq_input: Dequantize node feeding the activation input.
+        dq_weight: Dequantize node feeding the weight.
+        dq_bias: Dequantize node feeding the bias, or ``None`` to have a zero
+            bias created.
+        quant_node: Quantize node consuming the op output; its users are
+            redirected to the fused node.
+        op_node: The float ``aten.linear`` / ``aten.addmm`` node.
+        replacement_op: Fused op to emit.
+        weight_q: Optional node to use as the fused op's weight instead of
+            ``dq_weight``'s input. A generated zero bias is still derived from
+            ``dq_weight``'s input.
+
+    Returns:
+        The fused node, inserted after ``op_node``.
+
+    Raises:
+        AssertionError: If ``op_node`` is not a linear or addmm call.
+    """
+    assert op_node.target in (
+        torch.ops.aten.linear.default,
+        torch.ops.aten.addmm.default,
+    ), f"Expected linear/addmm, got {op_node.target}"
+    weight_scale = get_arg(dq_weight, "scale", float)
+    input_scale = get_arg(dq_input, "scale", float)
+    bias_scale = input_scale * weight_scale
+    # Ratio of the accumulator scale (input * weight) to the output scale,
+    # converted to a multiplier/shift pair by quantize_tensor_multiplier.
+    requantize_scale = bias_scale / get_arg(quant_node, "scale", float)
+    requantize_scale_t = torch.tensor([requantize_scale])
+    out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t)
+    if dq_bias is not None:
+        bias_q = get_arg(dq_bias, "input", fx.Node)
+    else:
+        # Cadence quantized linear ops require a non-optional bias argument.
+        weight_node = get_arg(dq_weight, "input", fx.Node)
+        with gm.graph.inserting_before(op_node):
+            bias_q = create_zero_bias_int32(gm, weight_node, bias_scale)
+    # A caller-supplied weight node takes precedence over the dq input.
+    final_weight = (
+        weight_q if weight_q is not None else get_arg(dq_weight, "input", fx.Node)
+    )
+    args = (get_arg(dq_input, "input", fx.Node), final_weight, bias_q)
+    kwargs = {
+        "src_zero_point": get_arg(dq_input, "zero_point", int),
+        "weight_zero_point": get_arg(dq_weight, "zero_point", int),
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+        "out_zero_point": get_arg(quant_node, "zero_point", int),
+        "offset": None,
+    }
+    return replace_with_op(gm, op_node, replacement_op, args, kwargs, quant_node)
+
+
+def fuse_matmul(
+    gm: fx.GraphModule,
+    anchor_node: fx.Node,
+    dq0: fx.Node,
+    dq1: fx.Node,
+    quant_node: fx.Node,
+    replacement_op: OpOverload,
+) -> fx.Node:
+    """Fuse a dq->matmul->q chain into a single quantized matmul op.
+
+    Args:
+        gm: Graph module being rewritten.
+        anchor_node: The float ``aten.bmm`` / ``aten.matmul`` node.
+        dq0: Dequantize node feeding the first operand.
+        dq1: Dequantize node feeding the second operand.
+        quant_node: Quantize node consuming the matmul output; its users are
+            redirected to the fused node.
+        replacement_op: Fused op to emit.
+
+    Returns:
+        The fused node, inserted after ``anchor_node``.
+
+    Raises:
+        AssertionError: If ``anchor_node`` is not a bmm or matmul call.
+    """
+    assert anchor_node.target in (
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.matmul.default,
+    ), f"Expected bmm/matmul, got {anchor_node.target}"
+    scale0 = get_arg(dq0, "scale", float)
+    scale1 = get_arg(dq1, "scale", float)
+    # Ratio of the product of both operand scales to the output scale,
+    # converted to a multiplier/shift pair by quantize_tensor_multiplier.
+    requantize_scale = (scale0 * scale1) / get_arg(quant_node, "scale", float)
+    requantize_scale_t = torch.tensor([requantize_scale])
+    out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t)
+    # Each operand is passed as (quantized tensor, zero point). The trailing
+    # None is presumably an absent bias — TODO confirm against the op schema.
+    args = (
+        get_arg(dq0, "input", fx.Node),
+        get_arg(dq0, "zero_point", int),
+        get_arg(dq1, "input", fx.Node),
+        get_arg(dq1, "zero_point", int),
+        None,
+    )
+    kwargs = {
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+        "out_zero_point": get_arg(quant_node, "zero_point", int),
+        "transposed": False,
+    }
+    return replace_with_op(gm, anchor_node, replacement_op, args, kwargs, quant_node)
diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py
@@ -9,7 +9,7 @@
 import operator
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import List, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import torch
 from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams
@@ -79,6 +79,22 @@ def replacement_op(self) -> OpOverload:
         """
         pass
 
+    def anchor_ops(self) -> tuple[OpOverload, ...]:
+        return tuple(self.partition_types())
+
+    def fuse(
+        self,
+        gm: fx.GraphModule,
+        anchor_node: fx.Node,
+    ) -> Optional[fx.Node]:
+        """Replace the dq→op→q subgraph around ``anchor_node`` with a fused op.
+
+        Called by ``QuantFusionPass`` for each node matching ``anchor_ops()``.
+        Returns the new fused node on success, or ``None`` to skip this match.
+        Subclasses override to implement pattern-specific fusion logic.
+
+        Args:
+            gm: Graph module containing ``anchor_node``.
+            anchor_node: Node whose target is one of ``anchor_ops()``.
+
+        Returns:
+            ``None`` in this base implementation, i.e. nothing is fused.
+        """
+        return None
+
 
 class AddmmPattern(QuantizationPattern):
     def partition_types(self) -> List[OpOverload]:
diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py
@@ -118,7 +118,9 @@ def create_zero_bias_int32(
     bias_scale: float,
 ) -> fx.Node:
     """
-    Creates a zero bias tensor with the shape of weight[0]
+    Creates a zero bias tensor with the shape of weight[0].
+    Caller is responsible for setting the graph insertion point
+    (e.g. ``with gm.graph.inserting_before(node):``).
     """
     try:
         attr_node = getattr(graph_module, weight_node.target)