NXP backend: added support for aten.bmm (#17818)

novak-vaclav · web-flow · commit 763cdd1fad62 · 2026-03-30T07:24:33.000+02:00
### Summary Adds support for the `aten.bmm` operator. The original PR is [here](#17670); however, I pushed to the branch without committing the work first and the PR closed itself auto-magically. ### Test plan Tests can be run manually using `pytest -c /dev/null backends/nxp/tests/`.
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
@@ -32,6 +32,7 @@
     exir_ops.edge.aten.addmm.default: AddMMConverter,  # noqa F405
     exir_ops.edge.aten.add.Tensor: AddTensorConverter,  # noqa F405
     exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter,  # noqa F405
+    exir_ops.edge.aten.bmm.default: BMMConverter,  # noqa F405
     exir_ops.edge.aten.cat.default: CatConverter,  # noqa F405
     exir_ops.edge.aten.clamp.default: ClampConverter,  # noqa F405
     exir_ops.edge.aten.clone.default: CloneConverter,  # noqa F405
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
@@ -13,6 +13,9 @@
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.avg_pool_2d_converter import (
     AvgPool2dConverter,
 )
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.bmm_converter import (
+    BMMConverter,
+)
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.cat_converter import (
     CatConverter,
 )
@@ -99,6 +102,7 @@
     "AddMMConverter",
     "AddTensorConverter",
     "AvgPool2dConverter",
+    "BMMConverter",
     "CatConverter",
     "ClampConverter",
     "CloneConverter",
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/bmm_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/bmm_converter.py
@@ -0,0 +1,117 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT
+from executorch.backends.nxp.backend.edge_helper import input_rank
+from executorch.backends.nxp.backend.ir.converter.conversion import translator
+from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList
+from executorch.backends.nxp.backend.ir.converter.node_converter import (
+    CustomDelegationOptions,
+    NodeConverter,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
+    batch_mat_mul_options,
+)
+from executorch.backends.nxp.backend.neutron_operator_support import (
+    transposition_is_supported_on_neutron,
+)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
+from torch.fx import Node
+from torch.nn import Parameter
+
+
+class BMMConverter(NodeConverter):
+    """Converter of the `aten.bmm` operator to the TFLite `BatchMatMul` operator.
+
+    `_is_supported_in_IR` accepts only nodes with exactly two rank-3 inputs.
+    `_is_supported_on_target` adds checks of the node formats and of the divisibility
+    of the multiplied dimensions by the number of MACs reported by the target spec.
+    """
+
+    @staticmethod
+    def _is_supported_in_IR(
+        node: Node,
+        parameters_mapping: dict[str, Parameter],
+        custom_delegation_options: CustomDelegationOptions,
+    ) -> bool:
+        """Check the target-independent preconditions of the conversion.
+
+        :param node: The `aten.bmm` node to inspect.
+        :param parameters_mapping: Not used by this check.
+        :param custom_delegation_options: Not used by this check.
+        :return: True only if `node` has exactly two input nodes and `input_rank`
+                  reports rank 3 for both of them.
+        """
+        if len(node.all_input_nodes) != 2:
+            return False
+
+        if input_rank(node, 0) != 3 or input_rank(node, 1) != 3:
+            return False
+
+        return True
+
+    @staticmethod
+    def _is_supported_on_target(
+        node: Node,
+        neutron_target_spec: NeutronTargetSpec,
+        parameters_mapping: dict[str, Parameter],
+        custom_delegation_options: CustomDelegationOptions,
+    ) -> bool:
+        """Check the target-specific preconditions of the conversion.
+
+        The node is rejected when the first input is channels-first while the second is
+        not, when the output is channels-first and the transposition that `convert`
+        would append is not supported, or when any of the two trailing dimensions of
+        either input is not divisible by `neutron_target_spec.get_num_macs()`.
+
+        :param node: The `aten.bmm` node to inspect. Its inputs are unpacked as
+                      3-element shapes here.
+        :param neutron_target_spec: Target description; provides the number of MACs and
+                                     is forwarded to the transposition support check.
+        :param parameters_mapping: Not used by this check.
+        :param custom_delegation_options: Not used by this check.
+        :return: True if none of the restrictions above is violated.
+        """
+        is_ch_first_1 = node.args[0].meta[NXP_NODE_FORMAT].is_channels_first()
+        is_ch_first_2 = node.args[1].meta[NXP_NODE_FORMAT].is_channels_first()
+        # This combination of node formats is not supported on Neutron (`adj_x = True`, `adj_y = False`),
+        # but it should never happen because both input tensors are expected to share the same format.
+        # NOTE(review): the opposite mismatch (`adj_x = False`, `adj_y = True`) is not rejected
+        # here - presumably it is supported or equally unreachable; confirm.
+        if is_ch_first_1 and not is_ch_first_2:
+            return False
+
+        # In case we need to insert transpose after `BatchMatMul`, we also need to check if
+        # such transposition is supported.
+        if node.meta[NXP_NODE_FORMAT].is_channels_first():
+            tensor_shape = node.meta["val"].shape
+            tensor_rank = len(tensor_shape)
+            perm = translator.create_channels_first_to_channels_last_permutation(
+                tensor_rank, return_list=True
+            )
+
+            # The output shape reordered by `perm` is what gets passed to the support check.
+            tensor_shape_channels_last = [tensor_shape[i] for i in perm]
+            if not transposition_is_supported_on_neutron(
+                tensor_shape_channels_last, perm, neutron_target_spec
+            ):
+                return False
+
+        # The leading (batch) dimension of each input is ignored by the check below.
+        _, d1, d2 = node.args[0].meta["val"].shape
+        _, d3, d4 = node.args[1].meta["val"].shape
+
+        # The Neutron converter requires that every dimension participating in the
+        # multiplication is divisible by NUM_MACS.
+        num_macs = neutron_target_spec.get_num_macs()
+        if not all(m % num_macs == 0 for m in [d1, d2, d3, d4]):
+            return False
+
+        return True
+
+    def convert(self, node: Node):
+        """Convert the `aten.bmm` operator to TFLite `BatchMatMul`.
+
+        The channels-first flags of the two inputs are passed as the first two
+        `BatchMatMul` options, and a transpose operator is added after the `BatchMatMul`
+        when the output node format is channels-first. The resulting operators are
+        appended to `self.builder`.
+
+        :param node: The `aten.bmm` node to convert; `assert_convertible` is called on it
+                      first.
+        """
+        self.assert_convertible(node)
+
+        t_op = self._create_tflite_op_with_io_tensors(node)
+
+        # We set `adj_x = adj_y = True` when the inputs are in channels-last format so
+        # that TFLite internally transposes them to channels-first. In that case, the
+        # output also becomes channels-first, so we need to transpose it back to
+        # channels-last afterward.
+        #
+        # NOTE(review): the flags below come from `is_channels_first()` of the node format;
+        # presumably a channels-first node format means the TFLite tensor itself is stored
+        # channels-last, which is what the paragraph above refers to - confirm.
+        #
+        # We set `asymmetric_quantize_inputs = False`. Neutron ignores this parameter
+        # entirely, so its value does not affect delegation and can be set arbitrarily.
+        is_ch_first_1 = node.args[0].meta[NXP_NODE_FORMAT].is_channels_first()
+        is_ch_first_2 = node.args[1].meta[NXP_NODE_FORMAT].is_channels_first()
+        t_op.builtin_options = batch_mat_mul_options.BatchMatMul(
+            is_ch_first_1, is_ch_first_2, False
+        )
+
+        x1 = t_op.tmp_inputs[0]
+        x2 = t_op.tmp_inputs[1]
+        y = t_op.tmp_outputs[0]
+
+        # Assign the operator its TFLite inputs and outputs
+        # NOTE(review): these are the same tensors that were just read from `t_op`, so the
+        # re-assignment only matters if `t_op` held additional entries - confirm.
+        t_op.tmp_inputs = [x1, x2]
+        t_op.tmp_outputs = [y]
+
+        ops = OpsList(middle_op=t_op)
+
+        # Transpose back to channels-last if needed.
+        if node.meta[NXP_NODE_FORMAT].is_channels_first():
+            tensor_rank = len(node.meta["val"].shape)
+            perm = translator.create_channels_first_to_channels_last_permutation(
+                tensor_rank, return_list=True
+            )
+            ops.add_post(self.builder.create_transpose_operator_after(t_op, 0, perm))
+
+        self.builder.append_operators(ops.flatten())
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
@@ -205,6 +205,7 @@ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
     exir_ops.edge.aten.addmm.default: AddMMConverter,  # noqa F405
     exir_ops.edge.aten.add.Tensor: AddTensorConverter,  # noqa F405
     exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter,  # noqa F405
+    exir_ops.edge.aten.bmm.default: BMMConverter,  # noqa F405
     exir_ops.edge.aten.cat.default: CatConverter,  # noqa F405
     exir_ops.edge.aten.clamp.default: ClampConverter,  # noqa F405
     exir_ops.edge.aten.clone.default: CloneConverter,  # noqa F405
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
@@ -20,6 +20,7 @@
     AvgPool1DPattern,
     AvgPool2DPattern,
     BatchNormPattern,
+    BMMPattern,
     CatPattern,
     ClampPattern,
     Conv1dPattern,
@@ -262,6 +263,7 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec, is_qat: bool = False)
                 OpQuantizer(AvgPool1DPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(AvgPool2DPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(BatchNormPattern(is_qat=is_qat), static_qconfig),
+                OpQuantizer(BMMPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(CatPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(ClampPattern(is_qat=is_qat), static_qconfig),
                 OpQuantizer(Conv1dPattern(is_qat=is_qat), static_qconfig),
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
@@ -298,6 +298,29 @@ def get_anchors(
         )
 
 
+class BMMPattern(QuantizationPattern):
+    """
+    Quantization pattern for the `aten.bmm` (BatchMatMul) operator. Both operands and
+    the output of the node are anchored; the pattern has no bias.
+    """
+
+    def partition_types(self) -> list[torch.nn.Module]:
+        # Operator type this pattern is registered for.
+        bmm_op = torch.ops.aten.bmm.default
+        return [bmm_op]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+    ) -> PartitionAnchors | None:
+        # The last node of the first fused partition is taken as the `bmm` node.
+        target = fused_partition[0].nodes[-1]
+
+        # Anchor the two operands at argument positions 0 and 1.
+        operand_anchors = [(target, NodeArgsIdx(position)) for position in (0, 1)]
+        result_anchors = [(target,)]
+
+        return PartitionAnchors(
+            inputs=operand_anchors,
+            biases=[],
+            output=result_anchors,
+        )
+
+
 class SubTensorPattern(QuantizationPattern):
     """
     Quantization pattern for Sub Tensor quantization. Accepts 1 or 2 input nodes.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
diff --git a/docs/source/backends/nxp/op-support.csv b/docs/source/backends/nxp/op-support.csv