# Copyright 2025-2026 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
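"""Quantization spec and config presets for the Cortex-M backend.

This module defines the int8 QuantizationSpec presets (per-tensor and
per-channel), the fixed softmax output spec, and a CortexMQuantizationConfig
that adjusts specs where CMSIS-NN imposes extra constraints (softmax output
qparams, shared qparams around pooling, and the transpose-conv weight channel
axis).
"""
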
import operator
from typing import Any, Callable

import torch
from executorch.backends.arm.quantizer.arm_quantizer_utils import (
    _get_int32_bias_qspec,
    _get_int32_per_channel_bias_qspec,
)
from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig
from executorch.backends.cortex_m.quantizer.quantizer_reporter import (
    SUPPORTED_QCONFIGS,
    SUPPORTED_QSPECS,
)
from torch.fx import Node
from torchao.quantization.pt2e import (
    HistogramObserver,
    MinMaxObserver,
    PerChannelMinMaxObserver,
)
from torchao.quantization.pt2e.quantizer import (
    FixedQParamsQuantizationSpec,
    QuantizationSpec,
    QuantizationSpecBase,
    SharedQuantizationSpec,
)

# ----------------- QUANTIZATION SPEC PRESETS -----------------

INT8_WEIGHT_PER_TENSOR_QSPEC = QuantizationSpec(
    dtype=torch.int8,
    observer_or_fake_quant_ctr=MinMaxObserver,
    qscheme=torch.per_tensor_symmetric,
)

INT8_WEIGHT_PER_CHANNEL_QSPEC = QuantizationSpec(
    dtype=torch.int8,
    observer_or_fake_quant_ctr=PerChannelMinMaxObserver,
    qscheme=torch.per_channel_symmetric,
    ch_axis=0,
)

# For transpose conv, output channels are at axis 1 (IOHW format vs OIHW for regular conv).
INT8_WEIGHT_PER_CHANNEL_TRANSPOSE_QSPEC = QuantizationSpec(
    dtype=torch.int8,
    observer_or_fake_quant_ctr=PerChannelMinMaxObserver,
    qscheme=torch.per_channel_symmetric,
    ch_axis=1,
)

INT8_ACTIVATION_PER_TENSOR_QSPEC = QuantizationSpec(
    dtype=torch.int8,
    observer_or_fake_quant_ctr=HistogramObserver,
    qscheme=torch.per_tensor_affine,
)

INT8_ACTIVATION_PER_CHANNEL_QSPEC = QuantizationSpec(
    dtype=torch.int8,
    observer_or_fake_quant_ctr=PerChannelMinMaxObserver,
    qscheme=torch.per_channel_affine,
    ch_axis=0,
)

# Constants shared by Cortex-M quantized operators.
CMSIS_SOFTMAX_SCALE: float = 1.0 / 256.0
CMSIS_SOFTMAX_ZERO_POINT: int = -128
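
# CMSIS-NN expects int8 softmax outputs with fixed quantization parameters:
# with scale 1/256 and zero point -128, the softmax output range [0, 1)
# maps onto the full int8 range [-128, 127].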
SOFTMAX_OUTPUT_FIXED_QSPEC = FixedQParamsQuantizationSpec(
    dtype=torch.int8,
    scale=CMSIS_SOFTMAX_SCALE,
    zero_point=CMSIS_SOFTMAX_ZERO_POINT,
    quant_min=-128,
    quant_max=127,
    qscheme=torch.per_tensor_affine,
)

SOFTMAX_TARGETS = {
    torch.ops.aten._softmax.default,
    torch.ops.aten.softmax.int,
}

CONV_TRANSPOSE_TARGETS = {
    torch.ops.aten.conv_transpose2d.input,
}
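
# Pooling ops whose output must share quantization parameters with their input
# (see CortexMQuantizationConfig.get_output_act_qspec below).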
POOL_SHARE_OUTPUT_TARGETS = {
    torch.ops.aten.avg_pool2d.default,
    torch.ops.aten.max_pool2d.default,
    torch.ops.aten.max_pool2d_with_indices.default,
}
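
# Clamp-style activations that may be fused onto a preceding pooling op; their
# output then shares quantization parameters with the pool's input as well.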
POOL_FUSED_ACTIVATION_TARGETS = {
    torch.ops.aten.relu.default,
    torch.ops.aten.relu_.default,
    torch.ops.aten.hardtanh.default,
    torch.ops.aten.hardtanh_.default,
    torch.ops.aten.clamp.default,
    torch.ops.aten.clamp_.default,
}


class CortexMQuantizationConfig(QuantizationConfig):
    """Configures quantization while enforcing Cortex-M specific constraints."""

    @staticmethod
    def _get_shared_pool_input(node: Node | None) -> Node | None:
        """Return the input of the pooling op feeding ``node``, if any.

        Walks from a fused activation back through an optional ``getitem``
        (e.g. from ``max_pool2d_with_indices``) to the pooling node and
        returns that pool's input, so the activation's output can share its
        quantization parameters. Returns None if no such pool is found.
        """
        if node is None or len(node.args) == 0:
            return None
        input_node = node.args[0]
        if not isinstance(input_node, Node):
            return None
        # Direct case: the node's input is itself a pooling op.
        if input_node.target in POOL_SHARE_OUTPUT_TARGETS:
            if len(input_node.args) > 0 and isinstance(input_node.args[0], Node):
                return input_node.args[0]
            return None
        # Indirect case: the node's input is a getitem over a pooling op's output tuple.
        if input_node.target == operator.getitem and len(input_node.args) > 0:
            pool_node = input_node.args[0]
            if (
                isinstance(pool_node, Node)
                and pool_node.target in POOL_SHARE_OUTPUT_TARGETS
                and len(pool_node.args) > 0
                and isinstance(pool_node.args[0], Node)
            ):
                return pool_node.args[0]
        return None

    def get_input_act_qspec(
        self, node: Node | None = None, input_node: Node | None = None
    ) -> QuantizationSpecBase | None:
        """Return the configured input activation spec; no Cortex-M specific adjustments."""
        return super().get_input_act_qspec()

    def get_output_act_qspec(
        self, node: Node | None = None
    ) -> QuantizationSpecBase | None:
        """Return the configured output activation spec with Cortex-M specific adjustments.

        - For softmax, returns a fixed quantization spec matching CMSIS-NN requirements.
        - For pooling ops (and activations fused onto them), returns a
          SharedQuantizationSpec so that the output shares quantization
          parameters with the pool's input.
        """
        if node is not None and node.target in SOFTMAX_TARGETS:
            if self.output_activation is None:
                return None
            return SOFTMAX_OUTPUT_FIXED_QSPEC
        if node is not None and node.target in POOL_SHARE_OUTPUT_TARGETS:
            if len(node.args) == 0:
                return super().get_output_act_qspec()
            input_node = node.args[0]
            if isinstance(input_node, Node):
                return SharedQuantizationSpec((input_node, node))
            return super().get_output_act_qspec()
        if node is not None and node.target in POOL_FUSED_ACTIVATION_TARGETS:
            shared_pool_input = self._get_shared_pool_input(node)
            if shared_pool_input is not None:
                return SharedQuantizationSpec(shared_pool_input)
        return super().get_output_act_qspec()

    def get_weight_qspec(
        self, node: Node | None = None
    ) -> QuantizationSpecBase | None:
        """Return the configured weight quantization spec with Cortex-M specific adjustments.

        - For transpose conv, returns the per-channel quantization spec with
          ch_axis=1 to match the IOHW weight format used by CMSIS-NN, instead
          of the default ch_axis=0.
        """
        weight_qspec = super().get_weight_qspec()
        if (
            node is not None
            and node.target in CONV_TRANSPOSE_TARGETS
            and weight_qspec is not None
            and isinstance(weight_qspec, QuantizationSpec)
            and weight_qspec.dtype == torch.int8
        ):
            return INT8_WEIGHT_PER_CHANNEL_TRANSPOSE_QSPEC
        return weight_qspec

    def get_bias_qspec(
        self, node: Node | None = None
    ) -> QuantizationSpecBase | Callable[[Any], Any] | None:
        """Return the configured bias quantization spec; no Cortex-M specific adjustments.

        If the configured bias spec is a callable (such as the int32 bias
        helpers), it is resolved by calling it with the node.
        """
        if callable(self.bias) and node is not None:
            return self.bias(node)
        return super().get_bias_qspec(node)


# ----------------- QUANTIZATION CONFIG PRESETS -----------------

INT8_PER_TENSOR_CONFIG = CortexMQuantizationConfig(
    INT8_ACTIVATION_PER_TENSOR_QSPEC,  # input activation
    INT8_ACTIVATION_PER_TENSOR_QSPEC,  # output activation
    INT8_WEIGHT_PER_TENSOR_QSPEC,  # weight
    _get_int32_bias_qspec,  # bias
)

INT8_PER_CHANNEL_CONFIG = CortexMQuantizationConfig(
    INT8_ACTIVATION_PER_TENSOR_QSPEC,  # input activation
    INT8_ACTIVATION_PER_TENSOR_QSPEC,  # output activation
    INT8_WEIGHT_PER_CHANNEL_QSPEC,  # weight
    _get_int32_per_channel_bias_qspec,  # bias
)

# Register supported quantization configs and qspecs in the reporter for
# human-readable reporting.
# MLETORCH-1854: Temporary solution, refactor to automatically register these instead.
SUPPORTED_QCONFIGS.update(
    {
        INT8_PER_CHANNEL_CONFIG: f"{__name__}.INT8_PER_CHANNEL_CONFIG",
        INT8_PER_TENSOR_CONFIG: f"{__name__}.INT8_PER_TENSOR_CONFIG",
    }
)

SUPPORTED_QSPECS.update(
    {
        INT8_ACTIVATION_PER_TENSOR_QSPEC: "INT8_ACTIVATION_PER_TENSOR_QSPEC",
        INT8_ACTIVATION_PER_CHANNEL_QSPEC: "INT8_ACTIVATION_PER_CHANNEL_QSPEC",
        INT8_WEIGHT_PER_TENSOR_QSPEC: "INT8_WEIGHT_PER_TENSOR_QSPEC",
        INT8_WEIGHT_PER_CHANNEL_QSPEC: "INT8_WEIGHT_PER_CHANNEL_QSPEC",
        INT8_WEIGHT_PER_CHANNEL_TRANSPOSE_QSPEC: "INT8_WEIGHT_PER_CHANNEL_TRANSPOSE_QSPEC",
        SOFTMAX_OUTPUT_FIXED_QSPEC: "SOFTMAX_OUTPUT_FIXED_QSPEC",
    }
)
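
# Example usage (illustrative sketch only): these presets are intended to be
# passed to the Cortex-M quantizer in a standard PT2E flow. The quantizer
# class, its import path, and its `set_global` method are assumptions based on
# other ExecuTorch backend quantizers and may differ; adjust to the actual
# Cortex-M quantizer API.
#
#     from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
#
#     quantizer = CortexMQuantizer()  # hypothetical quantizer, see note above
#     quantizer.set_global(INT8_PER_CHANNEL_CONFIG)
#     prepared = prepare_pt2e(exported_module, quantizer)
#     # ... run a few calibration batches through `prepared` ...
#     quantized = convert_pt2e(prepared)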