Skip to content

Commit d75e665

Browse files
authored
Qualcomm AI Engine Direct - Enable per-channel quantization for embedding op (#18433)
1 parent 573f930 commit d75e665

8 files changed

Lines changed: 147 additions & 8 deletions

File tree

backends/qualcomm/builders/node_visitor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,9 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict):
227227
# skip dequantize op, e.g. frozen_param -> dq -> conv2d
228228
user_0 = self.get_first_user(node)
229229
# Memory layout of QNN conv weights always ends with the output channel, e.g. conv2d weights are HWIO
230-
if user_0.target == exir_ops.edge.aten.convolution.default:
230+
if user_0.target in {
231+
exir_ops.edge.aten.convolution.default,
232+
}:
231233
quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1
232234
else:
233235
quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS]

backends/qualcomm/builders/op_embedding.py

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,22 @@
99

1010
import numpy as np
1111
import torch
12-
from executorch.backends.qualcomm.utils.constants import QCOM_DATA
12+
from executorch.backends.qualcomm.utils.constants import (
13+
QCOM_DATA,
14+
QCOM_DTYPE,
15+
QCOM_ENCODING,
16+
QCOM_QUANT_ATTRS,
17+
QCOM_QUANT_MAX,
18+
QCOM_QUANT_MIN,
19+
QCOM_SCALE,
20+
QCOM_SCALES,
21+
QCOM_ZERO_POINT,
22+
QCOM_ZERO_POINTS,
23+
)
1324

14-
from .node_visitor import NodeVisitor
25+
from .node_visitor import NodeVisitor, PER_CHANNEL_ENCODING, QNN_QUANT_TYPE_MAP
1526
from .node_visitor_manager import register_node_visitor
16-
from .qnn_constants import OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW
27+
from .qnn_constants import OpConvert, OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW
1728
from .utils import get_parameter
1829

1930

@@ -30,6 +41,9 @@ def define_node(
3041
nodes_to_wrappers: Dict[torch.fx.Node, PyQnnManager.TensorWrapper],
3142
) -> PyQnnManager.PyQnnOpWrapper:
3243
weight_node = self.get_node(node.args[0])
44+
is_pcq_embedding = QCOM_QUANT_ATTRS in weight_node.meta and weight_node.meta[
45+
QCOM_QUANT_ATTRS
46+
][QCOM_ENCODING] in (PER_CHANNEL_ENCODING)
3347
weight_tensor = get_parameter(weight_node, self.edge_program)
3448
weight_tensor_wrapper = self.define_tensor(
3549
weight_node,
@@ -52,17 +66,41 @@ def define_node(
5266
gather_input_tensors = [weight_tensor_wrapper, indices_tensor_wrapper]
5367

5468
output_tensor = self.get_tensor(node, node)
69+
node_name = node.name
70+
if is_pcq_embedding:
71+
node_quant_attrs = node.meta[QCOM_QUANT_ATTRS].copy()
72+
intermediate_quant_attrs = node.meta[QCOM_QUANT_ATTRS].copy()
73+
# Based on QNN HTP quantization constraints,
74+
# we should set the scale to the max of the per-channel scales and use per-tensor quantization for the embedding op
75+
intermediate_quant_attrs[QCOM_SCALE] = (
76+
weight_node.meta[QCOM_QUANT_ATTRS][QCOM_SCALES].max().item()
77+
)
78+
intermediate_quant_attrs[QCOM_ZERO_POINT] = (
79+
weight_node.meta[QCOM_QUANT_ATTRS][QCOM_ZERO_POINTS].max().item()
80+
)
81+
intermediate_quant_attrs[QCOM_DTYPE] = weight_node.meta[QCOM_QUANT_ATTRS][
82+
QCOM_DTYPE
83+
]
84+
intermediate_quant_attrs[QCOM_QUANT_MAX] = weight_node.meta[
85+
QCOM_QUANT_ATTRS
86+
][QCOM_QUANT_MAX]
87+
intermediate_quant_attrs[QCOM_QUANT_MIN] = weight_node.meta[
88+
QCOM_QUANT_ATTRS
89+
][QCOM_QUANT_MIN]
90+
node.meta[QCOM_QUANT_ATTRS] = intermediate_quant_attrs
91+
node_name += "_intermediate"
5592
output_tensor_wrapper = self.define_tensor(
5693
node,
5794
node,
5895
output_tensor,
5996
PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
6097
nodes_to_wrappers,
98+
node_name=node_name,
6199
)
62100
gather_output_tensors = [output_tensor_wrapper]
63101

64102
gather_op = PyQnnManager.PyQnnOpWrapper(
65-
node.name,
103+
node_name,
66104
QNN_OP_PACKAGE_NAME_QTI_AISW,
67105
OpGather.op_name,
68106
)
@@ -76,4 +114,36 @@ def define_node(
76114
{QCOM_DATA: np.int32(0)},
77115
)
78116

79-
return gather_op
117+
op_wrapper_list = [gather_op]
118+
119+
if is_pcq_embedding:
120+
node.meta[QCOM_QUANT_ATTRS] = node_quant_attrs
121+
act_quant_encoding, act_quant_configs = self.get_quant_encoding_conf(
122+
node, node
123+
)
124+
act_dtype = (
125+
torch.uint16
126+
if act_quant_configs[QCOM_DTYPE] == torch.int32
127+
else act_quant_configs[QCOM_DTYPE]
128+
)
129+
convert_tensor_wrapper = self.define_custom_tensor_wrapper(
130+
node_name=node.name,
131+
tensor_type=PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
132+
dtype=QNN_QUANT_TYPE_MAP[act_dtype],
133+
quant_encoding=act_quant_encoding,
134+
quant_configs=act_quant_configs,
135+
dims=output_tensor.size(),
136+
tensor=output_tensor,
137+
is_fake_tensor=True,
138+
nodes_to_wrappers=nodes_to_wrappers,
139+
)
140+
convert_op = PyQnnManager.PyQnnOpWrapper(
141+
node.name + "_convert",
142+
QNN_OP_PACKAGE_NAME_QTI_AISW,
143+
OpConvert.op_name,
144+
)
145+
convert_op.AddInputTensors(gather_output_tensors)
146+
convert_op.AddOutputTensors([convert_tensor_wrapper])
147+
op_wrapper_list.append(convert_op)
148+
149+
return op_wrapper_list

backends/qualcomm/quantizer/annotators/htp_rules.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -535,14 +535,13 @@ class Elu(GeneralOpDef):
535535
# TODO: Embedding op cannot directly map to OpGather because the index input in torch is not a tensor.
536536
@register_annotator(
537537
[
538-
torch.ops.aten.embedding.default,
539538
torch.ops.aten.gather.default,
540539
torch.ops.aten.index.Tensor,
541540
torch.ops.aten.index_select.default,
542541
],
543542
qnn_op=None,
544543
)
545-
class Embedding(GeneralOpDef):
544+
class Gather(GeneralOpDef):
546545
@staticmethod
547546
def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
548547
# args[2] = indices, which should be int
@@ -551,6 +550,40 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
551550
annotate_single_in_share_out(node, quantization_config)
552551

553552

553+
@register_annotator(
554+
[
555+
torch.ops.aten.embedding.default,
556+
],
557+
qnn_op=None,
558+
)
559+
class Embedding(GeneralOpDef):
560+
@staticmethod
561+
def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
562+
weight = node.args[0]
563+
564+
# Only quantize if input is a float tensor
565+
if _is_annotated([node]) or not _is_float_tensor(weight):
566+
return
567+
568+
is_pcq_embedding = quantization_config.per_channel_embedding
569+
input_qspec_map = {}
570+
input_qspec_map[weight] = (
571+
quantization_config.weight
572+
if is_pcq_embedding
573+
else quantization_config.input_activation
574+
)
575+
output_qspec = (
576+
quantization_config.input_activation
577+
if is_pcq_embedding
578+
else SharedQuantizationSpec((weight, node))
579+
)
580+
node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
581+
input_qspec_map=input_qspec_map,
582+
output_qspec=output_qspec,
583+
_annotated=True,
584+
)
585+
586+
554587
@register_annotator([torch.ops.aten.eq.Tensor], QnnConstants.OpElementWiseEqual.op_name)
555588
class Equal(GeneralOpDef):
556589
@staticmethod

backends/qualcomm/quantizer/qconfig.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class QuantizationConfig:
3939
weight: Optional[QuantizationSpec]
4040
bias: Optional[QuantizationSpec | Callable]
4141
block_size: Optional[Tuple] = None
42+
per_channel_embedding: bool = False
4243

4344

4445
def _derived_bias_quant_spec(node: Node) -> DerivedQuantizationSpec:

backends/qualcomm/quantizer/quantizer.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ class ModuleQConfig:
174174
is_qat: bool = False
175175
is_conv_per_channel: bool = False
176176
is_linear_per_channel: bool = False
177+
is_embedding_per_channel: bool = False
177178
act_observer: Optional[UniformQuantizationObserverBase] = None
178179
act_symmetric: bool = False
179180
eps: Optional[float] = None
@@ -226,6 +227,7 @@ def __post_init__(self):
226227
torch.ops.aten.conv_transpose2d.input: 1,
227228
torch.ops.aten.conv_transpose3d.input: 1,
228229
torch.ops.aten.linear.default: 0,
230+
torch.ops.aten.embedding.default: 0,
229231
}
230232

231233
self.use_per_channel_weight_quant_ops = {}
@@ -245,6 +247,17 @@ def __post_init__(self):
245247
self.use_per_channel_weight_quant_ops.update(
246248
{k: self.op_axis_dict[k] for k in linear_ops if k in self.op_axis_dict}
247249
)
250+
if self.is_embedding_per_channel:
251+
embedding_ops = [torch.ops.aten.embedding.default]
252+
self.use_per_channel_weight_quant_ops.update(
253+
{
254+
k: self.op_axis_dict[k]
255+
for k in embedding_ops
256+
if k in self.op_axis_dict
257+
}
258+
)
259+
for pcq_config in self.per_channel_quant_config_list:
260+
pcq_config.per_channel_embedding = True
248261

249262
if per_block_quant_config_func:
250263
self.per_block_quant_config_list = []
@@ -533,6 +546,7 @@ def set_default_quant_config(
533546
is_qat=False,
534547
is_conv_per_channel=False,
535548
is_linear_per_channel=False,
549+
is_embedding_per_channel=False,
536550
act_observer=None,
537551
act_symmetric=False,
538552
eps=None,
@@ -545,6 +559,7 @@ def set_default_quant_config(
545559
is_qat (bool, optional): Enables Quantization-Aware Training (QAT) mode. Defaults to Post-Training Quantization (PTQ) mode.
546560
is_conv_per_channel (bool, optional): Enables per-channel quantization for convolution operations.
547561
is_linear_per_channel (bool, optional): Enables per-channel quantization for linear (fully connected) operations.
562+
is_embedding_per_channel (bool, optional): Enables per-channel quantization for embedding operations.
548563
act_observer (Optional[UniformQuantizationObserverBase], optional): Custom observer for activation quantization. If not specified, the default observer is determined by `QUANT_CONFIG_DICT`.
549564
550565
"""
@@ -553,6 +568,7 @@ def set_default_quant_config(
553568
is_qat=is_qat,
554569
is_conv_per_channel=is_conv_per_channel,
555570
is_linear_per_channel=is_linear_per_channel,
571+
is_embedding_per_channel=is_embedding_per_channel,
556572
act_observer=act_observer,
557573
act_symmetric=act_symmetric,
558574
eps=eps,

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3089,6 +3089,19 @@ def test_qnn_backend_embedding(self):
30893089
)
30903090
self.lower_module_and_test_output(modules[i], sample_input)
30913091

3092+
# TODO: Once the accuracy issue is fixed, enable this test.
3093+
@unittest.skip("Bad accuracy for HTP")
3094+
def test_qnn_backend_embedding_per_channel(self):
3095+
module = Embedding() # noqa: F405
3096+
sample_input = (torch.Tensor([1, 2, 4, 5]).to(torch.int32),)
3097+
qdq_module = self.get_qdq_module(
3098+
module,
3099+
sample_input,
3100+
quant_dtype=QuantDtype.use_16a8w,
3101+
is_embedding_per_channel=True,
3102+
)
3103+
self.lower_module_and_test_output(qdq_module, sample_input)
3104+
30923105
def test_qnn_backend_equal(self):
30933106
test_comb = [
30943107
{

backends/qualcomm/tests/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ def get_qdq_module(
643643
inputs: Tuple[torch.Tensor],
644644
is_conv_per_channel: Optional[bool] = True,
645645
is_linear_per_channel: Optional[bool] = False,
646+
is_embedding_per_channel: Optional[bool] = False,
646647
custom_quant_annotations: Tuple[Callable] = (),
647648
quant_dtype: QuantDtype = QuantDtype.use_8a8w,
648649
dynamic_shapes: Dict = None,
@@ -659,6 +660,7 @@ def get_qdq_module(
659660
custom_annotations=custom_quant_annotations,
660661
per_channel_conv=is_conv_per_channel,
661662
per_channel_linear=is_linear_per_channel,
663+
per_channel_embedding=is_embedding_per_channel,
662664
submodule_qconfig_list=submodule_qconfig_list,
663665
backend=get_backend_type(self.backend),
664666
soc_model=self.model,

examples/qualcomm/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ def make_quantizer(
366366
custom_annotations=(),
367367
per_channel_conv=True,
368368
per_channel_linear=False,
369+
per_channel_embedding=False,
369370
act_observer=MovingAverageMinMaxObserver,
370371
act_symmetric=False,
371372
is_qat=False,
@@ -381,6 +382,7 @@ def make_quantizer(
381382
is_qat=is_qat,
382383
is_conv_per_channel=per_channel_conv,
383384
is_linear_per_channel=per_channel_linear,
385+
is_embedding_per_channel=per_channel_embedding,
384386
act_observer=act_observer,
385387
act_symmetric=act_symmetric,
386388
eps=eps,

0 commit comments

Comments (0)