Skip to content

Commit 5ec362a

Browse files
authored
Qualcomm AI Engine Direct - Fix MobilenetV3 and Stack Layout Transform (pytorch#16686)
### Summary - Resolve Mainline Issue: pytorch#16616 (comment) - Fix MobileNetV3 Accuracy Issue - Support drawing pydot graphs so we can draw LLM models. SVG rendering will get stuck when drawing LLM models. ### Test plan UT added.
1 parent 879f4a7 commit 5ec362a

9 files changed

Lines changed: 146 additions & 65 deletions

File tree

backends/qualcomm/_passes/layout_transform.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,8 @@ class LayoutTransform(ExportPass):
124124
exir_ops.edge.aten.sqrt.default,
125125
exir_ops.edge.aten.sub.Tensor,
126126
exir_ops.edge.aten.sum.dim_IntList,
127-
exir_ops.edge.aten.stack.default,
128127
exir_ops.edge.aten.topk.default,
129128
exir_ops.edge.aten._to_copy.default,
130-
exir_ops.edge.aten.unbind.int,
131129
exir_ops.edge.aten.where.self,
132130
_operator.getitem,
133131
torch.ops.aten.scalar_tensor.default,

backends/qualcomm/_passes/seq_mse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ def _make_operator(self, aten_op):
5656
groups = 1 if len(aten_op.args) < 7 else aten_op.args[6]
5757
has_bias = self.nominal_bias is not None
5858
module = torch.nn.Conv2d(
59-
in_channels=self.nominal_weight.shape[1],
59+
in_channels=self.nominal_weight.shape[1]
60+
* groups, # equivalent to input_tensor.shape[1]
6061
out_channels=self.nominal_weight.shape[0],
6162
kernel_size=self.nominal_weight.shape[-2:],
6263
stride=stride,

backends/qualcomm/builders/op_stack.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import numpy as np
1111
import torch
12-
from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
12+
from executorch.backends.qualcomm.utils.constants import QCOM_DATA
1313

1414
from .node_visitor import NodeVisitor
1515
from .node_visitor_manager import register_node_visitor
@@ -50,11 +50,10 @@ def define_node(
5050
)
5151
stack_output_tensors = [output_tensor_wrapper]
5252

53+
# Don't need to check axis_order since stack is a pytorch layout op according to layout transform.
5354
dim = 0 if len(node.args) == 1 else cast(int, node.args[1])
5455
if dim < 0:
5556
dim = dim % len(output_tensor.shape)
56-
if QCOM_AXIS_ORDER in node.meta:
57-
dim = node.meta[QCOM_AXIS_ORDER].index(dim)
5857
stack_op = PyQnnManager.PyQnnOpWrapper(
5958
node.name,
6059
QNN_OP_PACKAGE_NAME_QTI_AISW,

backends/qualcomm/builders/op_unbind.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import numpy as np
1111
import torch
12-
from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
12+
from executorch.backends.qualcomm.utils.constants import QCOM_DATA
1313

1414
from .node_visitor import NodeVisitor
1515
from .node_visitor_manager import register_node_visitor
@@ -52,11 +52,10 @@ def define_node(
5252
)
5353
unbind_output_tensors.append(output_tensor_wrapper)
5454

55+
# Don't need to check axis_order since unbind is a pytorch layout op according to layout transform.
5556
dim = 0 if len(node.args) == 1 else cast(int, node.args[1])
5657
if dim < 0:
5758
dim = dim % len(input_tensor.shape)
58-
if QCOM_AXIS_ORDER in node.meta:
59-
dim = node.meta[QCOM_AXIS_ORDER].index(dim)
6059
unbind_op = PyQnnManager.PyQnnOpWrapper(
6160
node.name,
6261
QNN_OP_PACKAGE_NAME_QTI_AISW,

backends/qualcomm/quantizer/annotators.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -400,9 +400,9 @@ def annotate_abs(node: Node, quantization_config: QuantizationConfig) -> None:
400400

401401
@register_annotator(
402402
[
403-
torch.torch.ops.aten.arange.default,
404-
torch.torch.ops.aten.arange.start,
405-
torch.torch.ops.aten.arange.start_step,
403+
torch.ops.aten.arange.default,
404+
torch.ops.aten.arange.start,
405+
torch.ops.aten.arange.start_step,
406406
]
407407
)
408408
def annotate_arange(node: Node, quantization_config: QuantizationConfig) -> None:
@@ -586,13 +586,6 @@ def annotate_hardswish(node: Node, quantization_config: QuantizationConfig) -> N
586586
annotate_single_in_single_out(node, quantization_config)
587587

588588

589-
@register_annotator(
590-
[torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardsigmoid_.default]
591-
)
592-
def annotate_hardsigmoid(node: Node, quantization_config: QuantizationConfig) -> None:
593-
annotate_single_in_single_out(node, quantization_config)
594-
595-
596589
@register_annotator([torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default])
597590
def annotate_hardtanh(node: Node, quantization_config: QuantizationConfig) -> None:
598591
annotate_single_in_single_out(node, quantization_config)
@@ -871,7 +864,14 @@ def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None:
871864
annotate_single_in_single_out(node, quantization_config)
872865

873866

874-
@register_annotator([torch.ops.aten.sigmoid, torch.ops.aten.sigmoid.default])
867+
@register_annotator(
868+
[
869+
torch.ops.aten.hardsigmoid.default,
870+
torch.ops.aten.hardsigmoid_.default,
871+
torch.ops.aten.sigmoid,
872+
torch.ops.aten.sigmoid.default,
873+
]
874+
)
875875
def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> None:
876876
if _is_annotated([node]):
877877
return
@@ -896,7 +896,7 @@ def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> Non
896896

897897
scale = 1 / (q_max - q_min + 1)
898898

899-
bias_obs_ctr = observer = FixedQParamsObserver.with_args(
899+
output_obs_ctr = observer = FixedQParamsObserver.with_args(
900900
scale=scale,
901901
zero_point=0,
902902
dtype=quantization_config.output_activation.dtype,
@@ -908,7 +908,7 @@ def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> Non
908908
get_8a8w_qnn_qat_config(),
909909
get_16a4w_qnn_qat_config(),
910910
):
911-
bias_obs_ctr = FixedQParamsFakeQuantize.with_args(
911+
output_obs_ctr = FixedQParamsFakeQuantize.with_args(
912912
observer=observer,
913913
scale=scale,
914914
zero_point=0,
@@ -923,7 +923,7 @@ def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> Non
923923
dtype=quantization_config.output_activation.dtype,
924924
quant_max=q_max,
925925
quant_min=q_min,
926-
observer_or_fake_quant_ctr=bias_obs_ctr,
926+
observer_or_fake_quant_ctr=output_obs_ctr,
927927
qscheme=torch.torch.per_tensor_affine,
928928
)
929929

backends/qualcomm/tests/models.py

Lines changed: 57 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,48 @@ def forward(self, x, y):
598598
return z
599599

600600

601+
class Conv2dDownUpSample(torch.nn.Module):
602+
def __init__(self, bias=True):
603+
super().__init__()
604+
self.conv = torch.nn.Conv2d(
605+
in_channels=16,
606+
out_channels=16,
607+
kernel_size=3,
608+
stride=2,
609+
padding=1,
610+
bias=bias,
611+
)
612+
self.conv_transpose = torch.nn.ConvTranspose2d(
613+
in_channels=16,
614+
out_channels=16,
615+
kernel_size=3,
616+
stride=2,
617+
padding=1,
618+
bias=bias,
619+
)
620+
621+
def forward(self, x):
622+
return self.conv_transpose(self.conv(x))
623+
624+
625+
class Conv2dFlip(torch.nn.Module):
626+
def __init__(self):
627+
super().__init__()
628+
self.conv = torch.nn.Conv2d(
629+
in_channels=16,
630+
out_channels=16,
631+
kernel_size=3,
632+
stride=2,
633+
padding=1,
634+
bias=False,
635+
)
636+
self.dims = [1, 3]
637+
638+
def forward(self, x):
639+
x = self.conv(x)
640+
return torch.flip(x, self.dims)
641+
642+
601643
class Conv2dMaxPool2d(torch.nn.Module):
602644
def __init__(self):
603645
super().__init__()
@@ -660,46 +702,14 @@ def forward(self, x):
660702
return self.conv(x)
661703

662704

663-
class Conv2dDownUpSample(torch.nn.Module):
664-
def __init__(self, bias=True):
665-
super().__init__()
666-
self.conv = torch.nn.Conv2d(
667-
in_channels=16,
668-
out_channels=16,
669-
kernel_size=3,
670-
stride=2,
671-
padding=1,
672-
bias=bias,
673-
)
674-
self.conv_transpose = torch.nn.ConvTranspose2d(
675-
in_channels=16,
676-
out_channels=16,
677-
kernel_size=3,
678-
stride=2,
679-
padding=1,
680-
bias=bias,
681-
)
682-
683-
def forward(self, x):
684-
return self.conv_transpose(self.conv(x))
685-
686-
687-
class Conv2dFlip(torch.nn.Module):
705+
class Conv2dStack(torch.nn.Module):
688706
def __init__(self):
689707
super().__init__()
690-
self.conv = torch.nn.Conv2d(
691-
in_channels=16,
692-
out_channels=16,
693-
kernel_size=3,
694-
stride=2,
695-
padding=1,
696-
bias=False,
697-
)
698-
self.dims = [1, 3]
708+
self.conv1 = torch.nn.Conv2d(3, 3, 3)
699709

700-
def forward(self, x):
701-
x = self.conv(x)
702-
return torch.flip(x, self.dims)
710+
def forward(self, x, y, z):
711+
x1 = self.conv1(x)
712+
return torch.stack((x1, y, z))
703713

704714

705715
class Conv2dSliceCopy(torch.nn.Module):
@@ -744,6 +754,16 @@ def forward(self, x):
744754
return topk_values
745755

746756

757+
class Conv2dUnbind(torch.nn.Module):
758+
def __init__(self):
759+
super().__init__()
760+
self.conv1 = torch.nn.Conv2d(3, 3, 3)
761+
762+
def forward(self, x):
763+
x1 = self.conv1(x)
764+
return torch.unbind(x1, dim=1)
765+
766+
747767
class Conv3dSequential(torch.nn.Module):
748768
def __init__(self, bias=True):
749769
super().__init__()

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1982,6 +1982,15 @@ def test_qnn_backend_conv2d_slice_copy(self):
19821982
sample_input = (torch.randn([2, 1, 3, 3]),)
19831983
self.lower_module_and_test_output(module, sample_input)
19841984

1985+
def test_qnn_backend_conv2d_stack(self):
1986+
module = Conv2dStack() # noqa: F405
1987+
sample_input = (
1988+
torch.randn(1, 3, 5, 5),
1989+
torch.randn(1, 3, 3, 3),
1990+
torch.randn(1, 3, 3, 3),
1991+
)
1992+
self.lower_module_and_test_output(module, sample_input)
1993+
19851994
def test_qnn_backend_conv2d_sum_reduce_dim(self):
19861995
module = Conv2dSumReduceDim() # noqa: F405
19871996
sample_input = (torch.randn([1, 1, 3, 3]),)
@@ -1992,6 +2001,14 @@ def test_qnn_backend_conv2d_topk(self):
19922001
sample_input = (torch.randn(1, 3, 32, 32),)
19932002
self.lower_module_and_test_output(module, sample_input)
19942003

2004+
# This test is to ensure unbind should be pytorch layout.
2005+
# However, unbind will be forced decomposed by executorch framework.
2006+
# Keep it here in case unbind doesn't get forced decomposed in future.
2007+
def test_qnn_backend_conv2d_unbind(self):
2008+
module = Conv2dUnbind() # noqa: F405
2009+
sample_input = (torch.randn(1, 3, 5, 5),)
2010+
self.lower_module_and_test_output(module, sample_input)
2011+
19952012
def test_qnn_backend_copy(self):
19962013
sample_inputs = [
19972014
(torch.randn(3, 4, 5),),
@@ -4365,6 +4382,16 @@ def test_qnn_backend_conv2d_slice_copy(self):
43654382
module = self.get_qdq_module(module, sample_input)
43664383
self.lower_module_and_test_output(module, sample_input)
43674384

4385+
def test_qnn_backend_conv2d_stack(self):
4386+
module = Conv2dStack() # noqa: F405
4387+
sample_input = (
4388+
torch.randn(1, 3, 5, 5),
4389+
torch.randn(1, 3, 3, 3),
4390+
torch.randn(1, 3, 3, 3),
4391+
)
4392+
module = self.get_qdq_module(module, sample_input)
4393+
self.lower_module_and_test_output(module, sample_input)
4394+
43684395
def test_qnn_backend_conv2d_sum_reduce_dim(self):
43694396
module = Conv2dSumReduceDim() # noqa: F405
43704397
sample_input = (torch.randn([1, 1, 3, 3]),)
@@ -4377,6 +4404,15 @@ def test_qnn_backend_conv2d_topk(self):
43774404
module = self.get_qdq_module(module, sample_input)
43784405
self.lower_module_and_test_output(module, sample_input)
43794406

4407+
# This test is to ensure unbind should be pytorch layout.
4408+
# However, unbind will be forced decomposed by executorch framework.
4409+
# Keep it here in case unbind doesn't get forced decomposed in future.
4410+
def test_qnn_backend_conv2d_unbind(self):
4411+
module = Conv2dUnbind() # noqa: F405
4412+
sample_input = (torch.randn(1, 3, 5, 5),)
4413+
module = self.get_qdq_module(module, sample_input)
4414+
self.lower_module_and_test_output(module, sample_input)
4415+
43804416
def test_qnn_backend_copy(self):
43814417
sample_inputs = [
43824418
(torch.randn(3, 4, 5),),
@@ -7757,7 +7793,7 @@ def test_mobilenet_v3(self):
77577793
metric = {
77587794
# GPU has accuracy issue now
77597795
QnnExecuTorchBackendType.kGpuBackend: {"top_1": 0, "top_5": 0},
7760-
QnnExecuTorchBackendType.kHtpBackend: {"top_1": 55, "top_5": 81},
7796+
QnnExecuTorchBackendType.kHtpBackend: {"top_1": 51, "top_5": 76},
77617797
}
77627798
self.assertGreaterEqual(
77637799
msg["top_1"], metric[get_backend_type(self.backend)]["top_1"]

backends/qualcomm/utils/utils.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import re
99
import warnings
1010
from collections import defaultdict, OrderedDict
11+
from enum import Enum
1112
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
1213

1314
import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor
@@ -924,10 +925,24 @@ def preprocess_binary(ctx_bin, compiler_specs):
924925
return bundle_prog
925926

926927

927-
def draw_graph(title, path, graph_module: torch.fx.GraphModule):
928+
class DrawFormat(Enum):
929+
SVG = 1
930+
PYDOT = 2
931+
932+
933+
def draw_graph(title, path, graph_module: torch.fx.GraphModule, format=DrawFormat.SVG):
928934
graph = passes.graph_drawer.FxGraphDrawer(graph_module, title)
929-
with open(f"{path}/{title}.svg", "wb") as f:
930-
f.write(graph.get_dot_graph().create_svg())
935+
warnings.warn(
936+
"For large models such as LLM, it is strongly recommended to use PYDOT format.",
937+
stacklevel=1,
938+
)
939+
if format == DrawFormat.SVG:
940+
with open(f"{path}/{title}.svg", "wb") as f:
941+
f.write(graph.get_dot_graph().create_svg())
942+
elif format == DrawFormat.PYDOT:
943+
graph.get_dot_graph().write_raw(f"{path}/{title}.dot")
944+
else:
945+
raise RuntimeError(f"Unknown format {format}.")
931946

932947

933948
def generate_gpu_compiler_spec(

0 commit comments

Comments (0)