Arm backend: TOSAQuantizerV2 fixes (pytorch#20031)

AdrianLundell · rascani · web-flow · commit 5563ee99eed6 · 2026-06-05T09:01:59.000+02:00
Break out fixes from pytorch#19758, as discussed in pytorch#19966.

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
Co-authored-by: RJ Ascani <rja@meta.com>
diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py
@@ -243,6 +243,18 @@ class PatternQuantizer(Quantizer, QuantizerReporterUser):
 
     """
 
+    PARAMETER_TARGETS = {
+        torch.ops.aten.linear.default,
+        torch.ops.aten.convolution.default,
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.conv1d.padding,
+        torch.ops.aten.conv2d.default,
+        torch.ops.aten.conv2d.padding,
+        torch.ops.aten.conv3d.default,
+        torch.ops.aten.conv3d.padding,
+        torch.ops.aten.conv_transpose2d.input,
+    }
+
     def __init__(
         self,
         quantization_config: QuantizationConfig | None,
@@ -275,75 +287,59 @@ def get_quantizer_info(self):
             support_config_path,
         )
 
-    def is_parameter(self, node: Node, model: torch.fx.GraphModule) -> bool:
-        """Returns True if the given node is a parameter of the model."""
-        try:
-            _ = model.get_parameter(node.target)  # type: ignore[arg-type]
-            return True
-        except Exception:
+    def is_weight(self, node: Node) -> bool:
+        """Returns True if node is used as a weight by all users."""
+        if node.op != "get_attr":
             return False
 
-    def is_weight(
-        self, node: Node, params: list[Node], model: torch.fx.GraphModule
-    ) -> bool:
-        """Returns True if node is the first parameter of the given
-        parameters.
-        """
-        return len(params) > 0 and node == params[0]
+        # Ensure that the node is used as a weight by all users
+        for user_node in node.users:
+            if user_node.target not in self.PARAMETER_TARGETS:
+                return False
 
-    def is_bias(
-        self, node: Node, params: list[Node], model: torch.fx.GraphModule
-    ) -> bool:
-        """Returns True if node is the second parameter of the given
-        parameters.
-        """
-        return len(params) == 2 and node == params[1]
+            args = list(user_node.args)
+            if not (len(args) > 1 and node == args[1]):
+                return False
+
+        return True
+
+    def is_bias(self, node: Node) -> bool:
+        """Returns True if node is used as a bias by all users."""
+        if node.op != "get_attr":
+            return False
+
+        # Ensure that the node is used as a bias by all users
+        for user_node in node.users:
+            if user_node.target not in self.PARAMETER_TARGETS:
+                return False
+
+            args = list(user_node.args)
+            if not (len(args) > 2 and node == args[2]):
+                return False
+
+        return True
 
     def annotate_match(
         self,
         match: list[Node],
         config: QuantizationConfig | None,
-        model: torch.fx.GraphModule,
     ) -> None:
         """Annotates a matched pattern according to the given quantization
         config.
         """
-        parameter_targets = {
-            torch.ops.aten.linear.default,
-            torch.ops.aten.convolution.default,
-            torch.ops.aten.conv1d.default,
-            torch.ops.aten.conv1d.padding,
-            torch.ops.aten.conv2d.default,
-            torch.ops.aten.conv2d.padding,
-            torch.ops.aten.conv3d.default,
-            torch.ops.aten.conv3d.padding,
-            torch.ops.aten.conv_transpose2d.input,
-        }
 
         for node in match:
             input_qspec_map = {}
             output_qspec = None
 
-            params = [n for n in node.all_input_nodes if self.is_parameter(n, model)]
-            if node.target in parameter_targets:
-                if len(params) == 0 or len(params) > 2:
-                    logger.warning(
-                        f"{node.name} is expected to have parameter tensors for weight/bias but no such inputs found, which may cause unexpected quantization annotations. This is likely caused by incorrect tensor instantiations or non-constant weight/biases."
-                    )
-            else:
-                if len(params) > 0:
-                    logger.warning(
-                        f"{node.name} is not expected to not have parameter tensors but found {[n.name for n in params]}, which may cause unexpected quantization annotations."
-                    )
-
             for input_node in node.all_input_nodes:
                 if not has_float_output(input_node):
                     continue
-                if self.is_weight(input_node, params, model):
+                if self.is_weight(input_node):
                     input_qspec_map[input_node] = (
                         config.get_weight_qspec(node) if config else None
                     )
-                elif self.is_bias(input_node, params, model):
+                elif self.is_bias(input_node):
                     input_qspec_map[input_node] = (
                         config.get_bias_qspec(node) if config else None  # type: ignore[assignment]
                     )
@@ -370,7 +366,7 @@ def annotate(self, model: torch.fx.GraphModule) -> None:  # type: ignore[overrid
         )
         for result in matches:
             if result.accepted:
-                self.annotate_match(result.pattern, self.quantization_config, model)
+                self.annotate_match(result.pattern, self.quantization_config)
                 self.report_accept(result.pattern)
             else:
                 self.report_reject(
@@ -424,6 +420,9 @@ class SharedQspecQuantizer(Quantizer, QuantizerReporterUser):
         torch.ops.aten.flip.default,
         torch.ops.aten.index_select.default,
         torch.ops.aten.index_put.default,
+        torch.ops.aten.index_put_.default,
+        torch.ops.aten.index_copy.default,
+        torch.ops.aten.index_copy_.default,
         torch.ops.aten.contiguous.default,
         torch.ops.aten.as_strided_copy.default,
         torch.ops.aten.pixel_shuffle.default,
@@ -571,6 +570,42 @@ def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
 
         return shared_nodes, adjacent_qspecs
 
+    def _should_skip_while_shared_qspec(self, node: Node) -> bool:
+        return node.target == torch.ops.higher_order.while_loop and bool(
+            node.meta.get("additional_inputs")
+        )
+
+    def _annotate_while_with_additional_inputs(
+        self,
+        root_node: Node,
+        adjacent_qspecs: list[Any],
+    ) -> bool:
+        if not self._should_skip_while_shared_qspec(root_node):
+            return False
+        if len(adjacent_qspecs) == 0:
+            self.report_reject(
+                [root_node],
+                "Couldn't find any adjacent quantization spec to annotate while_loop.",
+            )
+            return True
+
+        input_qspec = adjacent_qspecs[0]
+        input_qspec_map: dict[Node, Optional[QuantizationSpec]] = {
+            n: input_qspec for n in self._get_input_nodes_with_float_output(root_node)
+        }
+        output_qspec: Optional[QuantizationSpec] = None
+        if len(self._get_user_nodes_with_float_input(root_node)) > 0:
+            output_qspec = input_qspec
+
+        _mark_node_as_quantized(
+            root_node,
+            input_qspec_map,
+            output_qspec,
+            is_quantized=True,
+        )
+        self.report_accept([root_node])
+        return True
+
     def _annotate_shared_cluster(self, root_node: Node) -> None:
         if (
             len(self._get_input_nodes_with_float_output(root_node)) == 0
@@ -592,9 +627,11 @@ def _annotate_shared_cluster(self, root_node: Node) -> None:
         node_order = {node: index for index, node in enumerate(root_node.graph.nodes)}
         ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))
 
+        if self._annotate_while_with_additional_inputs(root_node, adjacent_qspecs):
+            return
+
         # Ensure the root node is the first one in the graph.
         root_node = ordered_nodes[0]
-
         if len(adjacent_qspecs) > 0:
             root_node_float_inputs = self._get_input_nodes_with_float_output(root_node)
             if len(root_node_float_inputs) > 0:
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -21,6 +21,7 @@
 from executorch.backends.arm.common.type import ensure_type
 from executorch.backends.arm.quantizer import QuantizationConfig
 
+from torch._ops import OpOverload
 from torch._subclasses import FakeTensor
 from torch.fx import Node
 from torchao.quantization.pt2e import (
@@ -441,7 +442,7 @@ def _match_pattern(
     return left_condition and right_condition
 
 
-_conv_ops = {
+_conv_ops: set[OpOverload] = {
     torch.ops.aten.conv1d.default,
     torch.ops.aten.conv2d.default,
     torch.ops.aten.conv2d.padding,
@@ -473,7 +474,7 @@ def _match_pattern(
     },
 }
 
-_one_to_one = {
+_one_to_one: set[OpOverload] = {
     torch.ops.aten.abs.default,
     torch.ops.aten.ceil.default,
     torch.ops.aten.erf.default,
@@ -514,7 +515,7 @@ def _match_pattern(
     torch.ops.aten.tan.default,
 }
 
-_one_to_one_shared_input_qspec = {
+_one_to_one_shared_input_qspec: set[OpOverload] = {
     torch.ops.aten.squeeze.default,
     torch.ops.aten.squeeze_copy.default,
     torch.ops.aten.squeeze_copy.dim,
@@ -574,7 +575,7 @@ def _match_pattern(
     torch.ops.aten.detach_copy.default,
 }
 
-_one_to_one_shared_input_or_input_act_qspec = {
+_one_to_one_shared_input_or_input_act_qspec: set[OpOverload] = {
     torch.ops.aten.alias.default,
     torch.ops.aten.clone.default,
     torch.ops.aten.hardtanh.default,
diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py
@@ -21,6 +21,7 @@
 
 from torchao.quantization.pt2e.quantizer import (
     DerivedQuantizationSpec,
+    FixedQParamsQuantizationSpec,
     QuantizationSpec,
     QuantizationSpecBase,
     SharedQuantizationSpec,
@@ -284,10 +285,18 @@ def get_input_act_qspec(self, node=None, input_node=None):
 
         For comparison operators, make sure that both inputs share the same
         quantization spec, by returning a SharedQuantizationSpec that ties the
-        quantization of both inputs together. For other operators, return the
-        default input activation spec.
+        quantization of both inputs together.
+
+        For trigonometric ops, ensure that input spec has fixed qparams.
+
+        For other operators, return the default input activation spec.
 
         """
+        # MLETORCH-1853: Fix lazy import when moving files around
+        from executorch.backends.arm.quantizer.quantization_annotator import (
+            _fixed_input_qspec_ops,
+        )
+
         if node is None or input_node is None:
             return super().get_input_act_qspec(node, input_node)
 
@@ -296,6 +305,29 @@ def get_input_act_qspec(self, node=None, input_node=None):
                 return super().get_input_act_qspec(node, input_node)
             else:
                 return SharedQuantizationSpec((node.args[0], node))
+        elif node.target in _fixed_input_qspec_ops:
+
+            input_act_qspec = super().get_input_act_qspec(node, input_node)
+            if not hasattr(input_act_qspec, "dtype") or not isinstance(
+                input_act_qspec.dtype, torch.dtype
+            ):
+                raise ValueError(
+                    f"{node.target} requires an input activation quantization "
+                    "spec to use fixed input qparams."
+                )
+            dtype = getattr(input_act_qspec, "dtype", None)
+            num_bits = torch.iinfo(dtype).bits
+
+            qparams = _fixed_input_qspec_ops[node.target][num_bits]
+            return FixedQParamsQuantizationSpec(
+                dtype=dtype,
+                scale=qparams.scale,
+                zero_point=qparams.zero_point,
+                quant_min=input_act_qspec.quant_min,
+                quant_max=input_act_qspec.quant_max,
+                qscheme=input_act_qspec.qscheme,
+                is_dynamic=input_act_qspec.is_dynamic,
+            )
 
         return super().get_input_act_qspec(node, input_node)
 
diff --git a/backends/arm/quantizer/quantizer_support.py b/backends/arm/quantizer/quantizer_support.py
@@ -77,8 +77,6 @@ def check_pattern(cls, pattern):
     torch.ops.aten.relu_.default,
     torch.ops.aten.hardtanh.default,
     torch.ops.aten.hardtanh_.default,
-    torch.ops.aten.hardsigmoid.default,
-    torch.ops.aten.hardsigmoid_.default,
     torch.ops.aten.clamp.default,
     torch.ops.aten.clamp_.default,
 ]
@@ -168,6 +166,14 @@ def check_pattern(cls, pattern):
         (torch.ops.aten.ge.Scalar,),
         (torch.ops.aten.eq.Scalar,),
         (torch.ops.aten.ne.Scalar,),
+        (torch.ops.aten.lstm.input,),
+        (torch.ops.aten.rnn_tanh.input,),
+        (torch.ops.aten.rnn_relu.input,),
+        (torch.ops.aten.gru.input,),
+        (torch.ops.aten.asin.default,),
+        (torch.ops.aten.acos.default,),
+        (torch.ops.aten.atanh.default,),
+        (torch.ops.aten.einsum.default,),
     ]
 )
 TOSA_QUANTIZER_SUPPORT_DICT: dict[tuple[OpOverload, ...], type[PatternCheck] | None] = {
diff --git a/backends/arm/scripts/docgen/docgen.py b/backends/arm/scripts/docgen/docgen.py
@@ -46,7 +46,9 @@ def get_docstring(obj) -> str:
 
     lines = docstring.split("\n")
     for line in lines:
-        if ":" in line and line.startswith(" "):
+        # Only first-level arg lines should become bullets.
+        is_arg_line = line.startswith("    ") and not line.startswith("        ")
+        if ":" in line and is_arg_line:
             new_line = line.strip()
             pos = new_line.index(":")
             new_line = f"- **{new_line[:pos]}**" + new_line[pos:]
diff --git a/backends/cortex_m/test/misc/test_portable_int8.py b/backends/cortex_m/test/misc/test_portable_int8.py
@@ -301,6 +301,36 @@ def _quantize_and_export(
         (torch.randn(6), torch.randn(6)),
         torch.int64,
     ),
+    "index_put_": OpCase(
+        torch.ops.aten.index_put_.default,
+        _build_module(
+            lambda x, y: torch.ops.aten.index_put_.default(
+                x, (torch.tensor([1, 3]),), torch.tensor([1.0, 2.0]), False
+            )
+        ),
+        (torch.randn(6), torch.randn(6)),
+        torch.int64,
+    ),
+    "index_copy": OpCase(
+        torch.ops.aten.index_copy.default,
+        _build_module(
+            lambda x, y: torch.ops.aten.index_copy.default(
+                x, 0, torch.tensor([0, 2]), y
+            )
+        ),
+        (torch.randn(4, 5), torch.randn(2, 5)),
+        torch.int64,
+    ),
+    "index_copy_": OpCase(
+        torch.ops.aten.index_copy_.default,
+        _build_module(
+            lambda x, y: torch.ops.aten.index_copy_.default(
+                x, 0, torch.tensor([0, 2]), y
+            )
+        ),
+        (torch.randn(4, 5), torch.randn(2, 5)),
+        torch.int64,
+    ),
     "contiguous": OpCase(
         torch.ops.aten.contiguous.default,
         _build_module(lambda x, y: torch.ops.aten.contiguous.default(x)),
diff --git a/docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md b/docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md
@@ -20,7 +20,7 @@ In this tutorial you will learn how to export a simple PyTorch model for the Exe
 ```{tip}
 If you are already familiar with this delegate, you may want to jump directly to the examples:
 * [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm)
-* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/backends/arm/scripts/aot_arm_compiler.py)
+* [A commandline compiler for quick tests and example models](https://github.com/pytorch/executorch/blob/main/backends/arm/scripts/aot_arm_compiler.py)
 ```
 
 This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on Arm&reg; Ethos&trade;-U targets. It is based on `ethos_u_minimal_example.ipynb`, provided in Arm’s examples folder.
@@ -142,9 +142,10 @@ save_pte_program(executorch_program_manager, "ethos_u_minimal_example.pte")
 
 
 ```{tip}
-For a quick start, you can use the script `backends/arm/scripts/aot_arm_compiler.py` to produce the pte.
+For a quick test, you can use the script `backends/arm/scripts/aot_arm_compiler.py` to produce the pte.
 To produce a pte file equivalent to the one above, run
-`python -m backends.arm.scripts.aot_arm_compiler --model_name=add --delegate --quantize --output=ethos_u_minimal_example.pte`
+`python -m backends.arm.scripts.aot_arm_compiler --model_name=add --delegate --quantize --output=ethos_u_minimal_example.pte`.
+For production use, you should instead use the stable Python API shown above.
 ```
 
 ### Runtime:
diff --git a/docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md b/docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md
diff --git a/examples/models/llama/tests/test_export_llama_lib.py b/examples/models/llama/tests/test_export_llama_lib.py