Commit 1994514

JakeStevens authored and facebook-github-bot committed
Move clamp to independent quantization in annotator (#17910)
Summary:
The clamp operation was incorrectly placed in `_one_to_one_shared_input_qspec`, which causes the input and output observers to be shared. This is problematic because clamp explicitly modifies the value range by enforcing min/max bounds.

When using clamp to prevent undefined behavior (e.g., clamping inputs to rsqrt to be positive), the pre-clamp and post-clamp ranges can be very different. With shared observers, the pre-clamp (smaller) values dominate the min_val, causing incorrect quantization parameters for the post-clamp tensor.

This fix moves clamp to `_one_to_one`, giving it independent input/output quantization so each observer properly tracks its respective range.

Differential Revision: D92408418
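A rough numeric sketch of the problem, assuming int8 symmetric quantization with scale = max_abs / 127 and reusing the [-50, 50] input clamped to [0, 1] exercised by the test added in this commit (the helper below is illustrative only, not part of the change):

# Illustrative only: why a shared observer mis-scales the clamp output.
# Assumes int8 symmetric quantization: scale = max(|min|, |max|) / 127.
def symmetric_scale(lo: float, hi: float) -> float:
    return max(abs(lo), abs(hi)) / 127.0

input_range = (-50.0, 50.0)   # range seen by the observer on the clamp input
output_range = (0.0, 1.0)     # range the clamp output actually spans

shared_scale = symmetric_scale(*input_range)        # ~0.394, dominated by the input
independent_scale = symmetric_scale(*output_range)  # ~0.0079, tracks only [0, 1]

# With the shared scale, the whole [0, 1] clamp output lands on roughly
# 1 / 0.394, i.e. 2-3 int8 steps; an independent output observer uses all 127.
print(shared_scale, independent_scale)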
1 parent eb77ed4 commit 1994514

4 files changed

Lines changed: 150 additions & 25 deletions

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 45 additions & 23 deletions
@@ -11,6 +11,7 @@
 import torch
 from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
     get_param_tensor,
     is_param_node,
     set_node_arg,

@@ -347,53 +348,74 @@ def call(self, graph_module: GraphModule) -> PassResult:  # noqa: C901


 class QuantizeClampArgumentsPass(ArmPass):
-    """This pass makes sure that the arguments to clamp.default are quantized
-    correctly.
+    """This pass quantizes the scalar min/max arguments of clamp.default and
+    inserts a RESCALE when the input and output quantization parameters differ.

-    More specifically, this pass:
-    - Makes sure the min and max values to clamp.default are quantized, if it's a quantized operator.
+    When clamp has independent input/output quantization (different scales),
+    a RESCALE is inserted before the clamp to convert the input from the
+    input domain to the output domain. The min/max bounds are quantized
+    using the output quantization parameters, ensuring they are precise
+    even when the clamp range is much narrower than the input range.

     """

     _passes_required_after: Set[Type[ExportPass]] = set()

     def call(self, graph_module: GraphModule) -> PassResult:
         modified = False
-        # Loop over the graph nodes and find full.default nodes.
         for n in graph_module.graph.nodes:
             n = cast(Node, n)
-            if n.target not in {
-                exir_ops.edge.aten.clamp.default,
-            }:
+            if n.target != exir_ops.edge.aten.clamp.default:
                 continue

             try:
+                input_qparams = get_input_qparams(n)
                 output_qparams = get_output_qparams(n)
             except ValueError:
                 continue
-            if len(output_qparams) == 0:
+            if len(input_qparams) == 0 or len(output_qparams) == 0:
                 continue

-            # Qparams are stored per user index; use the first entry.
-            qargs = next(iter(output_qparams.values()))
+            input_qargs = next(iter(input_qparams.values()))
+            output_qargs = next(iter(output_qparams.values()))
+
+            if input_qargs != output_qargs:
+                input_node = n.args[0]
+                with graph_module.graph.inserting_before(n):
+                    rescale_node = create_node(
+                        graph_module.graph,
+                        exir_ops.backend.tosa.RESCALE.default,
+                        (
+                            input_node,
+                            output_qargs.dtype,
+                            [
+                                input_qargs.get_scale_per_tensor()
+                                / output_qargs.get_scale_per_tensor()
+                            ],
+                            input_qargs.get_zp_per_tensor(),
+                            output_qargs.get_zp_per_tensor(),
+                        ),
+                        from_node=n,
+                    )
+                n.replace_input_with(input_node, rescale_node)
+                n.meta["input_qparams"] = {0: output_qargs}
+
+            qargs = output_qargs

-            if n.target == exir_ops.edge.aten.clamp.default:
-                # Quantize the min and max arguments of clamp, if they are not None
-                min_val = n.args[1]
-                max_val = None if len(n.args) <= 2 else n.args[2]
+            min_val = n.args[1]
+            max_val = None if len(n.args) <= 2 else n.args[2]

-                if min_val is not None:
-                    quantized_min_val = qargs.quantize_value(min_val).item()
-                    n.update_arg(1, quantized_min_val)
+            if min_val is not None:
+                quantized_min_val = qargs.quantize_value(min_val).item()
+                n.update_arg(1, quantized_min_val)

-                if max_val is not None:
-                    quantized_max_val = qargs.quantize_value(max_val).item()
-                    n.update_arg(2, quantized_max_val)
+            if max_val is not None:
+                quantized_max_val = qargs.quantize_value(max_val).item()
+                n.update_arg(2, quantized_max_val)

-                modified = True
+            modified = True

         if modified:
-            # Retrace to refresh fake tensor metadata after updating clamp min/max.
             graph_module = super().call(graph_module).graph_module
             graph_module.recompile()
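For intuition, here is a minimal sketch of the requantization a RESCALE performs. The helper name and the example scales/zero points are made up for illustration; only the scale_in / scale_out multiplier and the two zero-point arguments mirror what the pass builds above.

# Illustrative sketch of RESCALE semantics, not the ExecuTorch/TOSA API.
def rescale_value(q_in: int, scale_in: float, zp_in: int,
                  scale_out: float, zp_out: int) -> int:
    multiplier = scale_in / scale_out                  # same ratio the pass passes to RESCALE
    q_out = round((q_in - zp_in) * multiplier) + zp_out
    return max(-128, min(127, q_out))                  # saturate to int8 (assumed dtype)

# Made-up example: input quantized over [-50, 50] (scale 0.394, zero point 0),
# output over [0, 1] (scale 1/255, zero point -128).
print(rescale_value(q_in=2, scale_in=0.394, zp_in=0,
                    scale_out=1 / 255, zp_out=-128))   # -> 73, i.e. about 0.79 real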
backends/arm/quantizer/quantization_annotator.py

Lines changed: 4 additions & 2 deletions
@@ -433,6 +433,10 @@ def _match_pattern(
     torch.ops.aten.acos.default,
     torch.ops.aten.cumsum.default,
     torch.ops.aten.tan.default,
+    # Clamp modifies the value range (enforces min/max bounds), so it needs
+    # independent input/output quantization to properly track the clamped range.
+    torch.ops.aten.clamp.default,
+    torch.ops.aten.clamp.Tensor,
 ]

 _one_to_one_shared_input_qspec = [
@@ -480,8 +484,6 @@ def _match_pattern(
     torch.ops.aten.pad.default,
     torch.ops.aten.amax.default,
     torch.ops.aten.amin.default,
-    torch.ops.aten.clamp.default,
-    torch.ops.aten.clamp.Tensor,
     torch.ops.aten.unflatten.int,
     torch.ops.aten.gather.default,
     torch.ops.aten.unfold_copy.default,
backends/arm/test/quantizer/test_clamp_quantization.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024-2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Verify that clamp uses independent input/output quantization.
+
+Clamp modifies the value range by enforcing min/max bounds, so its output
+observer must be independent from its input observer. When observers are
+shared, the pre-clamp (wider) values dominate the observed range and the
+post-clamp tensor gets incorrect quantization parameters.
+
+This test feeds a wide-range input through a narrow clamp and checks that
+the quantization scale for the clamp output differs from the input scale.
+"""
+
+import torch
+from executorch.backends.arm.quantizer import (
+    get_symmetric_quantization_config,
+    TOSAQuantizer,
+)
+from executorch.backends.arm.tosa import TosaSpecification
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+Q_PER_TENSOR = torch.ops.quantized_decomposed.quantize_per_tensor.default
+DQ_PER_TENSOR = torch.ops.quantized_decomposed.dequantize_per_tensor.default
+
+
+class ClampModel(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.clamp(x, min=0.0, max=1.0)
+
+
+def test_clamp_has_different_input_output_qparams():
+    """Input and output scales must differ when clamp narrows the range.
+
+    A wide-range input ([-50, 50]) clamped to [0, 1] should produce a much
+    smaller output scale than input scale, because the output observer only
+    sees values in [0, 1] while the input observer sees the full [-50, 50].
+
+    Before the fix (clamp in _one_to_one_shared_input_qspec), both observers
+    were shared and would produce identical scales — the wider input range
+    dominated, wasting output precision.
+    """
+    model = ClampModel()
+    model.eval()
+
+    # Use deterministic wide-range calibration data so the input observer
+    # sees [-50, 50] while the output observer sees only [0, 1].
+    calibration_input = torch.linspace(-50, 50, 200).reshape(1, 200)
+
+    tosa_spec = TosaSpecification.create_from_string("TOSA-1.0+INT")
+    quantizer = TOSAQuantizer(tosa_spec)
+    quantizer.set_global(get_symmetric_quantization_config(is_per_channel=False))
+
+    exported = torch.export.export(model, (calibration_input,))
+    prepared = prepare_pt2e(exported.module(), quantizer)
+    prepared(calibration_input)
+    converted = convert_pt2e(prepared)
+
+    # After conversion the graph has explicit quantize/dequantize nodes.
+    # For clamp with independent qspecs the pattern is:
+    #   dequantize_per_tensor(input_scale) -> clamp -> quantize_per_tensor(output_scale)
+    # With shared qspecs both scales would be identical.
+    clamp_nodes = [
+        n
+        for n in converted.graph.nodes
+        if n.target in (torch.ops.aten.clamp.default, torch.ops.aten.clamp.Tensor)
+    ]
+    assert (
+        len(clamp_nodes) == 1
+    ), f"Expected exactly 1 clamp node, found {len(clamp_nodes)}"
+    clamp_node = clamp_nodes[0]
+
+    # Get the dequant feeding clamp's input — its scale is arg[1].
+    input_dq = clamp_node.args[0]
+    assert (
+        input_dq.target == DQ_PER_TENSOR
+    ), f"Expected dequantize_per_tensor before clamp, got {input_dq.target}"
+    input_scale = float(input_dq.args[1])
+
+    # Get the quant consuming clamp's output — its scale is arg[1].
+    clamp_users = list(clamp_node.users)
+    assert (
+        len(clamp_users) == 1
+    ), f"Expected exactly 1 user of clamp, found {len(clamp_users)}"
+    output_q = clamp_users[0]
+    assert (
+        output_q.target == Q_PER_TENSOR
+    ), f"Expected quantize_per_tensor after clamp, got {output_q.target}"
+    output_scale = float(output_q.args[1])
+
+    # With independent quantization the output scale (tracking [0, 1]) must
+    # be much smaller than the input scale (tracking [-50, 50]).
+    assert output_scale < input_scale, (
+        f"Clamp output scale ({output_scale}) should be smaller than input "
+        f"scale ({input_scale}) because clamp narrows [−50, 50] → [0, 1]. "
+        "If they are equal, clamp is using shared observers (bug)."
+    )

backends/arm/test/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ def define_arm_tests():
     # Quantization
     test_files += [
         "quantizer/test_generic_annotater.py",
+        "quantizer/test_clamp_quantization.py",
     ]

     # Misc tests
