Qualcomm AI Engine Direct - Adding QNN backend support for _cdist_forward core ATen op (pytorch#20195)

qti-horodnic · web-flow · commit 129c687d3abc · 2026-06-10T19:35:19.000-04:00
### Summary

Added support for the `_cdist_forward` core ATen op by reusing the existing `CDist` implementation. `_cdist_forward` is the internal ATen variant of `torch.cdist` that `torch.export` produces, so this change adds it as a target of the existing `DecomposeCDist` pass, along with other small additions that make sure the pass is registered in the correct pipelines.

### Test plan

```
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNFloatingPointOperator.test_qnn_backend_cdist_forward --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_cdist_forward --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
```

cc @cccclai @cbilgin @abhinaykukkadapu
diff --git a/backends/qualcomm/_passes/decompose_cdist.py b/backends/qualcomm/_passes/decompose_cdist.py
@@ -36,14 +36,19 @@ class DecomposeCDist(ExportPass):
     Decompose for math equivalent op.
     """
 
+    cdist_targets = {
+        torch.ops.aten.cdist.default,
+        torch.ops.aten._cdist_forward.default,
+    }
+
     def __init__(self) -> None:
         super().__init__()
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         graph = graph_module.graph
         for node in graph.nodes:
             model = CDist()
-            if torch.ops.aten.cdist.default == node.target:
+            if node.target in self.cdist_targets:
                 if len(node.args) > 2:
                     assert (
                         node.args[2] == 2
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -126,6 +126,7 @@ def get_default_pass_activations(cls):
             (DecomposeAny, True),
             (DecomposeAtan2, True),
             (DecomposeColIm, True),
+            (DecomposeCDist, True),
             (DecomposeFill, True),
             (DecomposeLogVariants, True),
             (DecomposeMaxPool3d, True),
@@ -278,6 +279,7 @@ def get_passes_dependency_for_capture_program(cls):
             DecomposeAny: [RemoveRedundancy],
             DecomposeAtan2: [RemoveRedundancy],
             DecomposeColIm: [FoldQDQ],
+            DecomposeCDist: [RemoveRedundancy],
             DecomposeFill: [RemoveRedundancy],
             DecomposeLinalgVectorNorm: [RemoveRedundancy],
             DecomposeLogVariants: [RemoveRedundancy],
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
@@ -502,7 +502,7 @@ The following PyTorch operators are supported through decomposition or annotatio
 | `aten.any` | `DecomposeAny` |
 | `aten.atan2.default`, `aten.atan2.out` | `DecomposeAtan2` |
 | `aten.add` (with alpha), `aten.sub` (with alpha) | `DecomposeBinaryAlpha` |
-| `aten.cdist` | `DecomposeCDist` |
+| `aten.cdist`, `aten._cdist_forward` | `DecomposeCDist` |
 | `aten.im2col`, `aten.col2im` | `DecomposeColIm` |
 | `aten.einsum` | `DecomposeEinsum` |
 | `aten.special_expm1` | `DecomposeExpM1` |
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -408,6 +408,14 @@ def forward(self, x, y):
         return torch.cdist(x, y, p=2)
 
 
+class CDistForward(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        return torch.ops.aten._cdist_forward.default(x, y, 2.0, None)
+
+
 class Ceil(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -428,6 +428,14 @@ def test_qnn_backend_cdist(self):
         )
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cdist_forward(self):
+        module = CDistForward()  # noqa: F405
+        sample_input = (
+            torch.randn(1, 125, 256),
+            torch.randn(1, 2048, 256),
+        )
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_channel_shuffle(self):
         module = ChannelShuffle(2)  # noqa: F405
         sample_input = (torch.randn(1, 4, 3, 3),)
@@ -3159,6 +3167,15 @@ def test_qnn_backend_cdist(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cdist_forward(self):
+        module = CDistForward()  # noqa: F405
+        sample_input = (
+            torch.randn(1, 125, 256),
+            torch.randn(1, 2048, 256),
+        )
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_channel_shuffle(self):
         module = ChannelShuffle(2)  # noqa: F405
         sample_input = (torch.randn(1, 4, 3, 3),)