Arm backend: Make quantization of infs user configurable (pytorch#19915)

martinlsm · web-flow · commit feb84f861613 · 2026-06-02T08:59:08.000+02:00
Add `QuantizeInfConfig` to the Arm pass pipeline config so compile specs
can set the finite values used to quantize infinities.

Signed-off-by: Martin Lindström <Martin.Lindstroem@arm.com>
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -150,10 +150,7 @@
 )
 from executorch.backends.arm._passes.arm_pass import ArmPass
 from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.common.pipeline_config import (
-    ArmPassPipelineConfig,
-    SoftmaxDecompositionConfig,
-)
+from executorch.backends.arm.common.pipeline_config import SoftmaxDecompositionConfig
 from executorch.backends.arm.tosa.specification import (
     tosa_spec_in_set,
     TosaLoweringContext,
@@ -221,16 +218,13 @@ def __init__(self, compile_spec: ArmCompileSpec) -> None:
         super().__init__()
         self.configure_skip_passes()
 
-    def configure_skip_passes(
-        self,
-        override_config: ArmPassPipelineConfig | None = None,
-    ) -> tuple[type, ...]:
+    def configure_skip_passes(self) -> tuple[type, ...]:
         """Configures the pass manager to skip certain passes based on the
         ArmPassPipelineConfig class found in the compile spec.
         """
         skip_set: set[type] = set()
 
-        config = override_config or self.compile_spec._get_pass_pipeline_config()
+        config = self.compile_spec._get_pass_pipeline_config()
         logger.debug(f"Skip Config: {config}")
 
         match config.softmax:
@@ -649,9 +643,14 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
             )
 
             # Postprocessing passes
+            quant_inf_cfg = self.compile_spec._get_pass_pipeline_config().quantize_inf
             self.add_passes(
                 [
-                    ReplaceInfAndLimitValuesPass(tfa_pass=True),
+                    ReplaceInfAndLimitValuesPass(
+                        quant_inf_cfg.neg_inf,
+                        quant_inf_cfg.pos_inf,
+                        tfa_pass=True,
+                    ),
                     DecomposeMaskedFillPass(tfa_pass=True),
                     DeduplicateGetAttrPass(tfa_pass=True),
                 ]
diff --git a/backends/arm/_passes/replace_inf_and_limit_values_pass.py b/backends/arm/_passes/replace_inf_and_limit_values_pass.py
@@ -16,12 +16,22 @@
 
 class ReplaceInfAndLimitValuesPass(ArmPass):
     """Rewrites +inf/-inf and floating-point limit values (e.g.,
-    torch.finfo(...).min/max) to quantization-friendly values (±255 by default),
+    torch.finfo(...).min/max) to configured quantization-friendly values,
     improving quantizer stability (notably for attention mask paths).
     """
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
+    def __init__(
+        self,
+        neg_inf: float,
+        pos_inf: float,
+        tfa_pass: bool = False,
+    ):
+        super().__init__(tfa_pass=tfa_pass)
+        self.neg_inf = neg_inf
+        self.pos_inf = pos_inf
+
     def _allowed_to_transform_named_buffer(self, buf_name, graph_module) -> bool:
         attr_nodes = [
             node
@@ -51,19 +61,19 @@ def call(self, graph_module: torch.fx.GraphModule):
                 continue
 
             modified = True
-            # 255 here is mainly for attention_mask in Llama for reasonable quant scale
-            t = torch.nan_to_num(tensor, posinf=255, neginf=-255)
+
+            t = torch.nan_to_num(tensor, posinf=self.pos_inf, neginf=self.neg_inf)
             setattr(graph_module, buf_name, t)
 
         for node in graph_module.graph.nodes:
             arg_list = list(node.args)
             for index, arg in enumerate(arg_list):
                 if arg == float("-inf") or arg == torch.finfo(torch.float32).min:
                     modified = True
-                    arg_list[index] = -255.0
+                    arg_list[index] = self.neg_inf
                 elif arg == float("inf") or arg == torch.finfo(torch.float32).max:
                     modified = True
-                    arg_list[index] = +255.0
+                    arg_list[index] = self.pos_inf
             node.args = tuple(arg_list)
 
         if modified:
diff --git a/backends/arm/common/pipeline_config.py b/backends/arm/common/pipeline_config.py
@@ -4,25 +4,75 @@
 # LICENSE file in the root directory of this source tree.
 
 import json
-from dataclasses import dataclass, fields
+from dataclasses import asdict, dataclass, field, fields, is_dataclass
 from enum import auto, Enum
-from typing import Any
+from typing import Any, cast
 
 
 class SoftmaxDecompositionConfig(Enum):
     MASKED = auto()  # Stable softmax + masked fill decomposition
     STABLE = auto()  # Stable softmax, no masked fill decomposition
 
 
+@dataclass
+class QuantizeInfConfig:
+    """Replacement values for infinities before quantization.
+
+    Infinities cannot be quantized directly, so the Arm pipeline replaces them
+    with finite values before running the quantization passes.
+
+    Args:
+        neg_inf (float): Value used for ``-inf``.
+        pos_inf (float): Value used for ``inf``.
+
+    """
+
+    neg_inf: float = -256.0
+    pos_inf: float = 255.0
+
+
 @dataclass
 class ArmPassPipelineConfig:
+    """Options for tuning the Arm pass pipeline.
+
+    Args:
+        softmax (SoftmaxDecompositionConfig): Softmax decomposition mode.
+        quantize_inf (QuantizeInfConfig): Values used when replacing
+            infinities before quantization.
+
+    Example:
+        compile_spec.set_pass_pipeline_config(
+            ArmPassPipelineConfig(
+                softmax=SoftmaxDecompositionConfig.STABLE,
+                quantize_inf=QuantizeInfConfig(
+                    neg_inf=-100.0,
+                    pos_inf=100.0,
+                ),
+            )
+        )
+
+    """
+
     softmax: SoftmaxDecompositionConfig = SoftmaxDecompositionConfig.MASKED
+    quantize_inf: QuantizeInfConfig = field(default_factory=QuantizeInfConfig)
 
     def is_default(self) -> bool:
-        return self.softmax is SoftmaxDecompositionConfig.MASKED
+        return (
+            self.softmax is SoftmaxDecompositionConfig.MASKED
+            and self.quantize_inf == QuantizeInfConfig()
+        )
 
-    def to_dict(self) -> dict[str, str]:
-        return {f.name: getattr(self, f.name).name for f in fields(self)}
+    def to_dict(self) -> dict[str, Any]:
+        data: dict[str, Any] = {}
+        for f in fields(self):
+            value = getattr(self, f.name)
+            if is_dataclass(value):
+                data[f.name] = asdict(cast(Any, value))
+            elif isinstance(value, Enum):
+                data[f.name] = value.name
+            else:
+                raise AssertionError(f"Cannot serialize {f.name}")
+        return data
 
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> "ArmPassPipelineConfig":
@@ -31,8 +81,13 @@ def from_dict(cls, data: dict[str, Any]) -> "ArmPassPipelineConfig":
             raw_value = data.get(f.name)
             if raw_value is None:
                 continue
-            enum_type = f.type
-            setattr(config, f.name, enum_type[raw_value])
+
+            if f.name == "quantize_inf":
+                config.quantize_inf = QuantizeInfConfig(**raw_value)
+            else:
+                # The field is an enum
+                enum_type = f.type
+                setattr(config, f.name, enum_type[raw_value])
         return config
 
     def serialize(self) -> bytes:
diff --git a/backends/arm/test/misc/test_pass_pipeline_config.py b/backends/arm/test/misc/test_pass_pipeline_config.py
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torch
+
 from executorch.backends.arm._passes import (
     DecomposeMaskedFillPass,
     DecomposeSoftmaxPass,
@@ -11,10 +13,26 @@
 from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager
 from executorch.backends.arm.common.pipeline_config import (
     ArmPassPipelineConfig,
+    QuantizeInfConfig,
     SoftmaxDecompositionConfig,
 )
 from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
 from executorch.backends.arm.tosa.specification import TosaSpecification
+from torch.export import export
+
+
+class ModuleWithInf(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.register_buffer(
+            "mask", torch.tensor([float("inf"), float("-inf")], dtype=torch.float32)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.mask  # type: ignore[operator]
+        x = torch.ops.aten.add.Tensor(x, float("-inf"))
+        x = torch.ops.aten.add.Tensor(x, float("inf"))
+        return x
 
 
 def test_pipeline_config_override_outside_compile_spec():
@@ -68,3 +86,27 @@ def test_softmax_config_stable_no_target():
     assert DecomposeSoftmaxPass not in skip_passes
     # STABLE: masked fill decomposition is disabled (skipped)
     assert DecomposeMaskedFillPass in skip_passes
+
+
+def test_quant_inf_config_reaches_annotation_pipeline():
+    QUANT_NEG_INF = -321.0
+    QUANT_POS_INF = 123.0
+
+    config = ArmPassPipelineConfig(
+        quantize_inf=QuantizeInfConfig(neg_inf=QUANT_NEG_INF, pos_inf=QUANT_POS_INF),
+    )
+    compile_spec = TosaCompileSpec(
+        TosaSpecification.create_from_string("TOSA-1.00+INT")
+    )
+    compile_spec.set_pass_pipeline_config(config)
+    manager = ArmPassManager(compile_spec)
+    exported = export(ModuleWithInf(), (torch.zeros(2),), strict=True)
+
+    transformed = manager.transform_for_annotation_pipeline(exported.graph_module)
+    tensor_constant_values = sorted(
+        constant.item()
+        for name, constant in transformed.named_buffers()
+        if name.startswith("_tensor_constant")
+    )
+
+    assert tensor_constant_values == [QUANT_NEG_INF, QUANT_POS_INF]
diff --git a/backends/arm/test/passes/test_replace_inf_values_pass.py b/backends/arm/test/passes/test_replace_inf_values_pass.py
@@ -49,26 +49,41 @@ def _get_mask_buffer(graph_module: fx.GraphModule) -> torch.Tensor:
 
 def test_replace_inf_and_limit_values_clamps_inf_constants():
     """Trace a module with infinities, run ReplaceInfAndLimitValuesPass, and
-    expect the buffer and scalar literals to be clamped to ±255 with no
-    infinities left.
+    expect the buffer and scalar literals to be clamped to the configured finite
+    values.
     """
+    QUANTIZED_NEG_INF = -42.0
+    QUANTIZED_POS_INF = 13.0
+
     gm = fx.symbolic_trace(ModuleWithInf())
 
-    result = ReplaceInfAndLimitValuesPass().call(gm)
+    result = ReplaceInfAndLimitValuesPass(
+        neg_inf=QUANTIZED_NEG_INF,
+        pos_inf=QUANTIZED_POS_INF,
+    ).call(gm)
     mask_after_pass = _get_mask_buffer(result.graph_module)
 
     assert result.modified
-    expected = torch.tensor([255.0, -255.0], dtype=mask_after_pass.dtype)
+    expected = torch.tensor(
+        [QUANTIZED_POS_INF, QUANTIZED_NEG_INF],
+        dtype=mask_after_pass.dtype,
+    )
     assert torch.equal(mask_after_pass, expected)
     assert not torch.isinf(mask_after_pass).any()
-    assert sorted(_get_add_constants(result.graph_module)) == [-255, 255]
+    assert sorted(_get_add_constants(result.graph_module)) == [
+        QUANTIZED_NEG_INF,
+        QUANTIZED_POS_INF,
+    ]
 
 
 def test_replace_inf_and_limit_values_respects_disallowed_nodes():
     """When nodes opt out of transforms, running the pass in TFA mode should
-    leave the mask buffer untouched while still clamping scalar literals to
-    ±255.
+    leave the mask buffer untouched while still clamping scalar literals to the
+    configured finite values.
     """
+    QUANTIZED_NEG_INF = -1_000_000.0
+    QUANTIZED_POS_INF = 10_000.0
+
     gm = fx.symbolic_trace(ModuleWithInf())
     mask_before = _get_mask_buffer(gm).clone()
 
@@ -82,7 +97,10 @@ def test_replace_inf_and_limit_values_respects_disallowed_nodes():
         ):
             node.meta[DISALLOW_TFA_META_KEY] = True
 
-    replace_inf = ReplaceInfAndLimitValuesPass()
+    replace_inf = ReplaceInfAndLimitValuesPass(
+        neg_inf=QUANTIZED_NEG_INF,
+        pos_inf=QUANTIZED_POS_INF,
+    )
     replace_inf.is_tfa_pass = True
 
     result = replace_inf.call(gm)
@@ -91,4 +109,7 @@ def test_replace_inf_and_limit_values_respects_disallowed_nodes():
     mask_after = _get_mask_buffer(result.graph_module)
     assert torch.equal(mask_after, mask_before)
     assert torch.isinf(mask_after).tolist() == [True, True]
-    assert sorted(_get_add_constants(result.graph_module)) == [-255, 255]
+    assert sorted(_get_add_constants(result.graph_module)) == [
+        QUANTIZED_NEG_INF,
+        QUANTIZED_POS_INF,
+    ]