Commit e712c31
Arm backend: Minor composable_quantizer improvements
- Make sure the root node of a shared cluster is the topologically first node of the model, to avoid a crash in torchao.
- Do not report get_attr nodes as non-annotated, as these are never quantized; reporting them led to unnecessarily long logs.
- Make SharedQuantization logging more to the point: do not warn for multiple users, as this is already reported in the quantizer report, and use one single, simple log message when nodes are left unquantized.
- Make the pre-transform-for-annotation report more minimal and only print it when relevant; update the notebook example accordingly.

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
Change-Id: I412918235799c99ab4b1b1e1f6412de4c906766f
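The first bullet is easiest to see on a toy graph. Below is a minimal sketch of the root-node selection, not the backend's actual code path; the traced function and the `shared_nodes` cluster are hypothetical stand-ins for the data-shuffling clusters the quantizer groups together:

```python
# Sketch of picking the topologically first node of a cluster as its root.
# Anchoring a shared quantization spec on any other member can reference a
# node that appears after its users, which is the kind of ordering problem
# this commit avoids.
import torch
from torch.fx import symbolic_trace

def f(x):
    return x.reshape(2, 2).transpose(0, 1).flatten()

gm = symbolic_trace(f)

# Hypothetical cluster: every call_method node in this toy graph.
shared_nodes = {n for n in gm.graph.nodes if n.op == "call_method"}

# Same ordering trick as in the patch: index every node by its position in
# the graph, then sort the cluster by that index.
node_order = {node: index for index, node in enumerate(gm.graph.nodes)}
ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))
root_node = ordered_nodes[0]  # topologically first node of the cluster
print(root_node.name)  # -> reshape
```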
1 parent 1debeb6 commit e712c31

4 files changed, 71 additions & 53 deletions

backends/arm/quantizer/arm_quantizer.py

Lines changed: 19 additions & 4 deletions
```diff
@@ -1109,6 +1109,23 @@ def _remove_annotations(self, model: GraphModule) -> GraphModule:
 
         return model
 
+    def _log_nonquantized_nodes(self, model: GraphModule) -> None:
+        non_quantized_nodes = [
+            n
+            for n in model.graph.nodes
+            if n.meta.get(DISALLOW_TFA_META_KEY, True) and n.op != "get_attr"
+        ]
+        if len(non_quantized_nodes) > 0:
+            msg = """
+----------------------------------------------------------------------------------------------------
+PRE-TRANSFORM FOR ANNOTATION QUANTIZATION REPORT
+----------------------------------------------------------------------------------------------------
+The following nodes are not marked for quantization and will not be decomposed in the transform for annotation pipeline:\n"""
+            for node in non_quantized_nodes:
+                msg += f"    {node.name}\n"
+
+            logger.debug(msg)
+
     def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         # Transform_for_annotation should only decompose ops if quantized, which is
         # indicated either by node.meta['DISALLOW_TFA_META_KEY']==False or no such key
@@ -1121,15 +1138,13 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         # run to set DISALLOW_TFA_META_KEY for quantized nodes and all nodes missing
         # this key afterwards are set to DISALLOW_TFA_META_KEY=True.
 
-        reporter = QuantizerReporter(
-            self.quantizers, "PRE-TRANSFORM_FOR_ANNOTATION QUANTIZATION REPORT"  # type: ignore[arg-type]
-        )
         model = super().annotate(model)
-        reporter.log_quantizer_report(model)
         for node in model.graph.nodes:
             if DISALLOW_TFA_META_KEY not in node.meta:
                 node.meta[DISALLOW_TFA_META_KEY] = True
 
+        self._log_nonquantized_nodes(model)
+
         pass_manager = ArmPassManager(self.compile_spec)
         transformed_model = pass_manager.transform_for_annotation_pipeline(model)
```

backends/arm/quantizer/arm_quantizer_utils.py

Lines changed: 30 additions & 35 deletions
```diff
@@ -592,46 +592,41 @@ def _annotate_shared_cluster(self, root_node: Node) -> None:
         node_order = {node: index for index, node in enumerate(root_node.graph.nodes)}
         ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))
 
-        if len(adjacent_qspecs) > 0:
-            if len(adjacent_qspecs) > 1:
-                logger.warning(
-                    f"Multiple adjacent quantization specs found for {', '.join([n.name for n in ordered_nodes])}, all nodes will share the input quantization spec of {root_node.name}."
-                )
+        # Ensure the root node is the first one in the graph.
+        root_node = ordered_nodes[0]
 
+        if len(adjacent_qspecs) > 0:
             root_node_float_inputs = self._get_input_nodes_with_float_output(root_node)
-            if len(root_node_float_inputs) == 0:
-                self.report_reject(
-                    ordered_nodes,
-                    "Couldn't find any floating point input to base shared quantization spec on.",
-                )
-                return
-            root_node_first_input = root_node_float_inputs[0]
-
-            shared_qspec = SharedQuantizationSpec((root_node_first_input, root_node))
-            for node in shared_nodes:
-                input_qspec_map: dict[Node, Optional[QuantizationSpec]] = {
-                    n: shared_qspec  # type: ignore[misc]
-                    for n in self._get_input_nodes_with_float_output(node)
-                }
-                if len(self._get_user_nodes_with_float_input(node)) == 0:
-                    output_qspec = None
-                else:
-                    output_qspec = shared_qspec
-                _mark_node_as_quantized(
-                    node, input_qspec_map, output_qspec, is_quantized=True
+            if len(root_node_float_inputs) > 0:
+
+                root_node_first_input = root_node_float_inputs[0]
+                shared_qspec = SharedQuantizationSpec(
+                    (root_node_first_input, root_node)
                 )
+                for node in shared_nodes:
+                    input_qspec_map: dict[Node, Optional[QuantizationSpec]] = {
+                        n: shared_qspec  # type: ignore[misc]
+                        for n in self._get_input_nodes_with_float_output(node)
+                    }
+                    if len(self._get_user_nodes_with_float_input(node)) == 0:
+                        output_qspec = None
+                    else:
+                        output_qspec = shared_qspec
+                    _mark_node_as_quantized(
+                        node, input_qspec_map, output_qspec, is_quantized=True
+                    )
 
-            root_node.meta[Q_ANNOTATION_KEY].input_qspec_map[root_node_first_input] = (
-                adjacent_qspecs[0]
-            )
-            self.report_accept(ordered_nodes)
+                root_node.meta[Q_ANNOTATION_KEY].input_qspec_map[
+                    root_node_first_input
+                ] = adjacent_qspecs[0]
+                self.report_accept(ordered_nodes)
+                return
 
-        else:
-            self.report_reject(
-                ordered_nodes,
-                "Couldn't find any adjacent quantization spec to base shared quantization spec on. You may however quantize these nodes manually if required.",
-            )
-            return
+        self.report_reject(
+            ordered_nodes,
+            "All inputs and outputs to these nodes are non-quantized.",
+        )
+        return
 
     def annotate(self, model: torch.fx.GraphModule) -> None:  # type: ignore[override]
         for node in model.graph.nodes:
```
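For context, the shared-spec pattern in the rewritten branch can be sketched on a toy two-node cluster. This uses the PT2E quantizer types from `torch.ao.quantization.quantizer` (the backend may import them from torchao), and the literal `"quantization_annotation"` is assumed to be what `Q_ANNOTATION_KEY` resolves to:

```python
# Sketch of anchoring a SharedQuantizationSpec on the (input, root) edge of
# a cluster, so the root's input and every cluster node reuse one set of
# quantization parameters.
import torch
from torch.ao.quantization.quantizer import (
    QuantizationAnnotation,
    SharedQuantizationSpec,
)
from torch.fx import symbolic_trace

def f(x):
    return x.reshape(2, 2).transpose(0, 1)

gm = symbolic_trace(f)
reshape, transpose = (n for n in gm.graph.nodes if n.op == "call_method")

root_node = reshape             # topologically first cluster node
root_input = root_node.args[0]  # the float input feeding the cluster
shared_qspec = SharedQuantizationSpec((root_input, root_node))

# Every node in the cluster shares the root's input quantization params.
for node in (reshape, transpose):
    node.meta["quantization_annotation"] = QuantizationAnnotation(
        input_qspec_map={node.args[0]: shared_qspec},
        output_qspec=shared_qspec,
        _annotated=True,
    )
```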

backends/cortex_m/quantizer_reporter.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -383,7 +383,9 @@ def unannotated_nodes_report(
         non_quantized_nodes: list[Node] = []
     else:
         non_quantized_nodes = [
-            node for node in model.graph.nodes if Q_ANNOTATION_KEY not in node.meta
+            node
+            for node in model.graph.nodes
+            if Q_ANNOTATION_KEY not in node.meta and node.op != "get_attr"
         ]
 
     rows = []
```

examples/arm/quantizer_tutorial.ipynb

Lines changed: 19 additions & 13 deletions
```diff
@@ -167,9 +167,25 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### The quantization report\n",
+    "### Pre-transform for annotation quantization report\n",
+    "Note that two quantization reports are printed in this case; this is because the quantization\n",
+    "annotator is run twice during quantization, for two different purposes:\n",
     "\n",
-    "In the logged quantization report each quantizer has added one header describing targeted nodes, the used quantization config, and the supported operators / operator patterns. \n",
+    "1. Mark nodes to be decomposed by the `transform_for_annotation` pipeline (PRE-TRANSFORM FOR ANNOTATION QUANTIZATION REPORT)\n",
+    "2. Perform the actual quantization after the `transform_for_annotation` pipeline (FINAL QUANTIZATION REPORT)\n",
+    "\n",
+    "Consider for example the `torch.div` operator, which is decomposed into one multiplication and one\n",
+    "reciprocal operator when quantized. Since both ops in the decomposition require different quantization parameters, this decomposition needs to happen before quantization, in the `transform_for_annotation` pipeline, but it\n",
+    "must not happen if the operator is to be kept in float.\n",
+    "\n",
+    "**This is important to be aware of when doing mixed quantization, since it means that for an operator to be fully quantized,\n",
+    "both the original operator and its decomposition need to be targeted.**\n",
+    "\n",
+    "The pre-transform for annotation report prints all nodes which are not marked for decomposition (if any) to make this\n",
+    "easy to get right. If regular full-graph quantization is done, this report is simply skipped.\n",
+    "\n",
+    "### The final quantization report\n",
+    "In the second quantization report, each quantizer has added one header describing the targeted nodes, the used quantization config, and the supported operators / operator patterns. \n",
     "```\n",
     "PatternQuantizer using NodeNameNodeFinder targeting names: conv2d, relu\n",
     "Annotating with executorch.backends.arm.quantizer.arm_quantizer.get_symmetric_quantization_config(is_per_channel=True)\n",
@@ -187,7 +203,7 @@
     "```\n",
     "      NODE NAME    INPUT QSPEC MAP                           OUTPUT QSPEC MAP\n",
     "  --  -----------  ----------------------------------------  ---------------------\n",
-    "  ╒   conv2d       x: INT8_PER_TENSOR_QSPEC                  NO_QSPEC\n",
+    "  ╒   conv2d       x: INT8_PER_TENSOR_QSPEC                  None\n",
     "  |                _param_constant0: INT8_PER_CHANNEL_QSPEC\n",
     "  |                _param_constant1: DERIVED_QSPEC\n",
     "  ╘   relu \n",
@@ -198,16 +214,6 @@
     "many different quantization annotations for different types of tensors; per tensor for\n",
     "activations, per channel for weights, and a special quantization spec for the int32 bias. \n",
     "\n",
-    "### Pre-transform for annotation vs. final quantization report\n",
-    "One important detail is that there are two reports printed, one named PRE-TRANSFORM_FOR_ANNOTATION QUANTIZATION REPORT,\n",
-    "and one named FINAL QUANTIZATION REPORT. This is related to the fact that some operators has to be decomposed before quantization to ensure\n",
-    "that all \"sub operators\" gets quantized properly. As an example, the division operator in the first report\n",
-    "has decomposed into a reciprocal and multiplication operator in the second. Had it not been marked for quantization\n",
-    "in the first step, it would have remained a single division operator.\n",
-    "\n",
-    "**This is important to be aware of when doing mixed quantization since this means that for an operator to be fully quantized,\n",
-    "both the original operator and the decomposition needs to be targeted.**\n",
-    "\n",
     "### SharedQspecQuantizer\n",
     "Last in the report there is always an additional quantizer applied which is not specified by the user, the SharedQspecQuantizer.\n",
     "It handles data shuffling operators without numerical behaviour such as copies and reshapes to ensure that they are quantized with the same qspec as\n",
```
