pytorch
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
Lines changed: 7 additions & 6 deletions
diff --git a/backends/cortex_m/passes/aten_to_cortex_m_pass.py b/backends/cortex_m/passes/aten_to_cortex_m_pass.py
Lines changed: 45 additions & 21 deletions
diff --git a/backends/cortex_m/test/ops/test_linear.py b/backends/cortex_m/test/ops/test_linear.py
Lines changed: 161 additions & 1 deletion
@@ -467,8 +467,8 @@ def quantized_linear_meta(
 def quantized_linear_impl(
     input: torch.Tensor,
     weights: torch.Tensor,
-    bias: torch.Tensor,
-    kernel_sum: torch.Tensor,
+    bias: torch.Tensor | None,
+    kernel_sum: torch.Tensor | None,
     input_offset: int,
     filter_offset: int,
     output_offset: int,
@@ -481,10 +481,11 @@ def quantized_linear_impl(
     Functional variant - creates output tensor and calls out variant
     """
 
-    # Leaving both implementations for debugging purposes.
-    compute_using_kernel_sum = True
-
-    if compute_using_kernel_sum:
+    # Mirror CMSIS-NN's arm_fully_connected_s8 contract: the MVE path reads
+    # kernel_sum (ctx.buf) and ignores bias; the DSP and scalar paths read
+    # bias and ignore kernel_sum. The AOT pass populates exactly one of them
+    # based on the target ISA, so dispatch off which one is present.
+    if kernel_sum is not None:
         weights_int32 = weights.to(torch.int32)
 
         input_int32 = input.to(torch.int32)
 
@@ -7,6 +7,7 @@
 
 from typing import cast
 
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import executorch.backends.cortex_m.ops.operators  # noqa
 import executorch.exir as exir
 import torch
@@ -146,7 +147,7 @@ def _has_qparams(node: Node) -> bool:
 @AtenToCortexMPass.register_dialect_substitution(exir_ops.edge.aten.tanh.default)
 @AtenToCortexMPass.register_dialect_substitution(exir_ops.edge.aten.silu.default)
 def _get_activation_replacement(
-    node: Node, exported_program: ExportedProgram
+    node: Node, dialect_pass: AtenToDialectPass
 ) -> DialectNodeSpec | None:
     """Lower a standalone quantized sigmoid / tanh / silu to a single
     cortex_m.quantized_activation call backed by an AoT-built 256-entry
@@ -156,6 +157,7 @@ def _get_activation_replacement(
     if not _has_qparams(node):
         return None
 
+    exported_program = dialect_pass.exported_program
     input_qparams = node.meta["input_qparams"][0]
     output_qparams = node.meta["output_qparams"][0]
     lut_tensor = build_activation_lut(
@@ -187,7 +189,7 @@ def _get_activation_replacement(
 
 @AtenToCortexMPass.register_dialect_substitution(exir_ops.edge.aten.linear.default)
 def _get_linear_replacement(
-    node: Node, exported_program: ExportedProgram
+    node: Node, dialect_pass: AtenToDialectPass
 ) -> DialectNodeSpec | None:
     """
     Let
@@ -209,6 +211,10 @@ def _get_linear_replacement(
     if not _has_qparams(node):
         return None
 
+    assert isinstance(dialect_pass, AtenToCortexMPass)
+    exported_program = dialect_pass.exported_program
+    target_config = dialect_pass.target_config
+
     input_scale = node.meta["input_qparams"][0].scale
     input_zp = node.meta["input_qparams"][0].zp
     weight_scale = node.meta["input_qparams"][1].scale
@@ -218,37 +224,52 @@ def _get_linear_replacement(
     output_min = node.meta["output_qparams"][0].qmin
     output_max = node.meta["output_qparams"][0].qmax
 
+    if weight_zp != 0:
+        raise NotImplementedError(
+            f"cortex_m::quantized_linear assumes symmetric weight "
+            f"quantization (weight_zp == 0); got weight_zp={weight_zp}"
+        )
+
     quantized_multiplier, quantized_shift = quantize_multiplier_aot(
         (input_scale * weight_scale) / output_scale
     )
 
-    # TODO: Add support for configuring the backend to support other extensions.
-    # Kernel sum is only used in the CMSIS-NN implementation for the MVE extension,
-    # so this should be optional.
+    # CMSIS-NN's MVE `arm_fully_connected_s8` path reads a precomputed
+    # kernel_sum (input_offset×sum(weight) + bias) from ctx.buf and
+    # ignores the bias argument. The DSP and scalar paths do the opposite
+    # — they read the bias argument at runtime and ignore ctx.buf
+    # (see arm_nn_vec_mat_mult_t_s8.c). Pick the right input format here
+    # based on the target ISA so the runtime gets exactly what it expects.
     linear_args = node.args
     weights = cast(Node, linear_args[1])
     weights_tensor = get_param_tensor(exported_program, weights)
     bias_node = cast(Node | None, linear_args[2]) if len(linear_args) > 2 else None
     bias_tensor = (
         get_param_tensor(exported_program, bias_node) if bias_node is not None else None
     )
-    kernel_sum_tensor = _compute_kernel_sum(
-        weights_tensor, bias_tensor, -input_zp, -weight_zp
-    )
-    with node.graph.inserting_after(weights):
-        kernel_sum = create_constant_placeholder(
-            exported_program,
-            node.graph,
-            node.name + "_kernel_sum",
-            InputKind.PARAMETER,
-            kernel_sum_tensor,
+
+    if target_config.backend == cmsis_nn.Backend.MVE:
+        kernel_sum_tensor = _compute_kernel_sum(
+            weights_tensor, bias_tensor, -input_zp, -weight_zp
         )
+        with node.graph.inserting_after(weights):
+            kernel_sum_arg = create_constant_placeholder(
+                exported_program,
+                node.graph,
+                node.name + "_kernel_sum",
+                InputKind.PARAMETER,
+                kernel_sum_tensor,
+            )
+        bias_arg = None
+    else:
+        kernel_sum_arg = None
+        bias_arg = bias_node
 
     args = (
         linear_args[0],
         weights,
-        None,
-        kernel_sum,
+        bias_arg,
+        kernel_sum_arg,
         -input_zp,
         -weight_zp,
         output_zp,
@@ -263,11 +284,12 @@ def _get_linear_replacement(
 
 @AtenToCortexMPass.register_dialect_substitution(exir_ops.edge.aten.convolution.default)
 def _get_convolution_replacement(
-    node: Node, exported_program: ExportedProgram
+    node: Node, dialect_pass: AtenToDialectPass
 ) -> DialectNodeSpec | None:
     if not _has_qparams(node):
         return None
 
+    exported_program = dialect_pass.exported_program
     conv_args = node.args
     (
         x,
@@ -292,7 +314,7 @@ def _get_convolution_replacement(
     )
 
     if transposed:
-        return _get_transpose_conv2d_replacement(node, exported_program)
+        return _get_transpose_conv2d_replacement(node, dialect_pass)
 
     input_scale = node.meta["input_qparams"][0].scale
     input_zero_point = node.meta["input_qparams"][0].zp
@@ -437,14 +459,15 @@ def _get_convolution_replacement(
 
 
 def _get_transpose_conv2d_replacement(
-    node: Node, exported_program: ExportedProgram
+    node: Node, dialect_pass: AtenToDialectPass
 ) -> DialectNodeSpec | None:
     """
     Transform aten.convolution with transposed=True to cortex_m.quantized_transpose_conv2d.
     """
     if not _has_qparams(node):
         return None
 
+    exported_program = dialect_pass.exported_program
     conv_t_args = node.args
     (
         x,
@@ -562,11 +585,12 @@ def _get_transpose_conv2d_replacement(
 
 @AtenToCortexMPass.register_dialect_substitution(exir_ops.edge.aten.bmm.default)
 def _get_bmm_replacement(
-    node: Node, exported_program: ExportedProgram
+    node: Node, dialect_pass: AtenToDialectPass
 ) -> DialectNodeSpec | None:
     if not _has_qparams(node):
         return None
 
+    exported_program = dialect_pass.exported_program
     lhs_scale = node.meta["input_qparams"][0].scale
     lhs_zp = node.meta["input_qparams"][0].zp
     rhs_scale = node.meta["input_qparams"][1].scale
 
@@ -1,16 +1,21 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 
+from dataclasses import dataclass
+
 import torch
 from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
 from executorch.backends.cortex_m.test.tester import (
     CortexMTester,
     McuTestCase,
     ramp_tensor,
 )
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir.dialects._ops import ops as exir_ops
 
 
 class CortexMLinear(torch.nn.Module):
@@ -128,3 +133,158 @@ def test_dialect_linear(test_case):
 def test_implementation_linear(test_case):
     tester = CortexMTester(test_case.model, test_case.example_inputs)
     tester.test_implementation(qtol=1)
+
+
+# ---------------------------------------------------------------------------
+# Regression: cortex_m::quantized_linear must pick the right CMSIS-NN input
+# convention based on the target ISA. `arm_fully_connected_s8` reads
+# kernel_sum (ctx.buf) on MVE/Helium and reads the bias argument on DSP/scalar
+# paths; the two are mutually exclusive. Previously the pass unconditionally
+# emitted the MVE shape, which silently dropped the bias and input-offset
+# terms on every non-MVE build. The regression only showed up when those
+# terms dominated the int32 accumulator -- i.e., on small-magnitude inputs.
+#
+# Coverage strategy: a single ISA-parametrized dialect test verifies the
+# numeric output against the float reference (catches the dropped-bias bug
+# directly), checks ops_after_transforms to confirm the linear lowered, and
+# asserts the post-pass node has the value in the slot the configured ISA
+# expects -- the structural guard against a regression that emits zero-valued
+# kernel_sum on a no-bias DSP path (numerically inert, but wrong shape).
+# An additional implementation test drives the default M55 MVE build path
+# through the simulator.
+# ---------------------------------------------------------------------------
+
+
+class _SmallMagnitudeLinear(torch.nn.Module):
+    """Single ``Linear(512, 10)`` layer used by the small-magnitude linear tests.
+
+    The two class-level dicts are forwarded to ``CortexMTester.test_dialect``
+    as ``ops_before_transforms`` / ``ops_after_transforms`` by
+    ``test_dialect_linear_small_magnitude``.
+
+    Args:
+        bias: Passed straight through to ``torch.nn.Linear``.
+    """
+
+    # Keyed by edge-dialect op name. NOTE(review): values are presumably the
+    # expected node counts before the Cortex-M passes run -- confirm against
+    # CortexMTester.test_dialect.
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 4,
+    }
+    # Same keying, for the graph after the passes: the aten linear is expected
+    # to be replaced by exactly one cortex_m.quantized_linear.
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self, bias: bool = True):
+        super().__init__()
+        self.fc = torch.nn.Linear(512, 10, bias=bias)
+
+    def forward(self, x):
+        """Apply the single fully connected layer to ``x``."""
+        return self.fc(x)
+
+
+class _SmallMagnitudeLinearNoBias(_SmallMagnitudeLinear):
+    """Bias-free variant of ``_SmallMagnitudeLinear``.
+
+    Only ``ops_before_transforms`` is overridden; ``ops_after_transforms`` is
+    inherited unchanged from the parent class.
+    """
+
+    # One fewer dequantize than the parent (3 vs 4). NOTE(review): presumably
+    # because there is no bias tensor to dequantize -- confirm.
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    def __init__(self):
+        super().__init__(bias=False)
+
+
+def _small_magnitude_input():
+    """Return a fresh ``(1, 512)`` tensor of uniform samples in ``[0, 0.002)``.
+
+    Draws from torch's global RNG without seeding it here, so each call
+    returns different values unless the caller has seeded torch beforehand.
+    """
+    return torch.rand(1, 512) * 0.002
+
+
+# Eight single-input tuples, drawn once at import time and shared by every
+# test that passes them as ``calibration_samples``.
+_small_magnitude_calibration = [(_small_magnitude_input(),) for _ in range(8)]
+
+
+@dataclass(frozen=True)
+class _SmallMagnitudeVariant:
+    """Immutable bundle describing one parametrized small-magnitude test case."""
+
+    # Model plus example-input factory handed to CortexMTester.
+    case: McuTestCase
+    # Target configuration forwarded to CortexMTester(target_config=...).
+    target_config: CortexMTargetConfig
+    # True when the lowered quantized_linear node is expected to carry
+    # kernel_sum (args[3]) and a None bias (args[2]).
+    uses_kernel_sum: bool
+    # On the non-kernel_sum path: True when the bias slot (args[2]) is expected
+    # to be populated, False when it is expected to be None.
+    has_bias: bool
+
+
+def _small_magnitude_variant(
+    model_cls, cpu: CortexM, *, uses_kernel_sum: bool, has_bias: bool
+) -> _SmallMagnitudeVariant:
+    """Build a ``_SmallMagnitudeVariant`` for ``model_cls`` targeting ``cpu``.
+
+    Args:
+        model_cls: Zero-argument callable returning the module under test; the
+            instance is switched to eval mode.
+        cpu: CPU used to construct the ``CortexMTargetConfig``.
+        uses_kernel_sum: Stored on the variant unchanged (expected slot layout).
+        has_bias: Stored on the variant unchanged (expected slot layout).
+
+    Returns:
+        The assembled, frozen variant.
+    """
+    return _SmallMagnitudeVariant(
+        case=McuTestCase(
+            model=model_cls().eval(),
+            # Lazily produces a new random input every time it is called.
+            example_inputs=lambda: (_small_magnitude_input(),),
+        ),
+        target_config=CortexMTargetConfig(cpu=cpu),
+        uses_kernel_sum=uses_kernel_sum,
+        has_bias=has_bias,
+    )
+
+
+# bias=True covers the regression directly (the bug dropped the bias term);
+# bias=False covers the symmetric case where only the input-offset term is
+# missing on the non-MVE paths.
+#
+# Each entry pairs a CPU with the argument slot the test expects to be
+# populated: M55 -> kernel_sum, M4 and M0PLUS -> bias (or neither slot when
+# the model has no bias).
+small_magnitude_variants = {
+    "mve_bias": _small_magnitude_variant(
+        _SmallMagnitudeLinear, CortexM.M55, uses_kernel_sum=True, has_bias=True
+    ),
+    "dsp_bias": _small_magnitude_variant(
+        _SmallMagnitudeLinear, CortexM.M4, uses_kernel_sum=False, has_bias=True
+    ),
+    "scalar_bias": _small_magnitude_variant(
+        _SmallMagnitudeLinear, CortexM.M0PLUS, uses_kernel_sum=False, has_bias=True
+    ),
+    "mve_nobias": _small_magnitude_variant(
+        _SmallMagnitudeLinearNoBias, CortexM.M55, uses_kernel_sum=True, has_bias=False
+    ),
+    "dsp_nobias": _small_magnitude_variant(
+        _SmallMagnitudeLinearNoBias, CortexM.M4, uses_kernel_sum=False, has_bias=False
+    ),
+    "scalar_nobias": _small_magnitude_variant(
+        _SmallMagnitudeLinearNoBias,
+        CortexM.M0PLUS,
+        uses_kernel_sum=False,
+        has_bias=False,
+    ),
+}
+
+
+@parametrize("variant", small_magnitude_variants)
+def test_dialect_linear_small_magnitude(variant: _SmallMagnitudeVariant):
+    """Run the dialect test for one variant, then check the lowered node's slots.
+
+    First runs ``CortexMTester.test_dialect`` with the variant's target
+    configuration and the model's expected op dicts. Then inspects the single
+    ``cortex_m.quantized_linear`` node in the post-pass graph and asserts that
+    exactly the expected one of ``bias`` (args[2]) / ``kernel_sum`` (args[3])
+    is populated.
+    """
+    tester = CortexMTester(
+        variant.case.model,
+        variant.case.get_example_inputs(),
+        target_config=variant.target_config,
+    )
+    tester.test_dialect(
+        ops_before_transforms=variant.case.model.ops_before_transforms,
+        ops_after_transforms=variant.case.model.ops_after_transforms,
+        qtol=1,
+        calibration_samples=_small_magnitude_calibration,
+    )
+
+    # Structural guard: numeric divergence catches the original dropped-bias
+    # bug, but a future regression that emits zero-valued kernel_sum on a
+    # no-bias DSP/scalar path would be numerically inert. Assert the slot the
+    # configured ISA actually consumes is populated and the unused one is None.
+    module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
+    linear_target = exir_ops.edge.cortex_m.quantized_linear.default
+    # Single-element unpacking doubles as an assertion: it raises ValueError
+    # unless the graph contains exactly one quantized_linear call.
+    [linear_node] = [
+        n
+        for n in module.graph.nodes
+        if n.op == "call_function" and n.target == linear_target
+    ]
+    # Positional layout of the lowered op: (input, weights, bias, kernel_sum, ...).
+    bias_arg, kernel_sum_arg = linear_node.args[2], linear_node.args[3]
+    if variant.uses_kernel_sum:
+        assert kernel_sum_arg is not None
+        assert bias_arg is None
+    else:
+        assert kernel_sum_arg is None
+        if variant.has_bias:
+            assert bias_arg is not None
+        else:
+            assert bias_arg is None
+
+
+def test_implementation_linear_small_magnitude():
+    """Exercise the MVE kernel_sum codepath via the default M55 simulator build."""
+    case = McuTestCase(
+        model=_SmallMagnitudeLinear().eval(),
+        example_inputs=lambda: (_small_magnitude_input(),),
+    )
+    # No target_config is passed, so CortexMTester falls back to its default.
+    # NOTE(review): the docstring assumes that default is M55/MVE -- confirm.
+    tester = CortexMTester(case.model, case.get_example_inputs())
+    tester.test_implementation(qtol=1, calibration_samples=_small_magnitude_calibration)