Qualcomm AI Engine Direct - Adding QNN backend support for fill.scalar core ATen op (pytorch#19826)

qti-horodnic · web-flow · commit 4d698cbe3443 · 2026-06-04T11:53:36.000-07:00
### Summary
Added support for the `fill.Scalar` op (and its in-place variant `fill_.Scalar`) via a
decomposition pass that rewrites it to the `full` op, using the identity:

```
fill(input, value) = full(input.shape, value)
```

### Test plan
```
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_fill --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android

python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNFloatingPointOperator.test_qnn_backend_fill --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
```
diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md
@@ -210,7 +210,7 @@ class DecomposeMyOp(ExportPass):
         return PassResult(graph_module, True)
 ```
 
-**Critical rules:** (1) handle both dialects via `EdgeOpOverload` check, (2) `copy_meta` on every new node, (3) lift scalars to tensors in edge dialect with `get_const_node`, (4) cache constants with `const_cache`, (5) for bool-output nodes use `callback=lambda m: {**m, "val": m["val"].to(torch.bool)}` in `create_node`.
+**Critical rules:** (1) handle both dialects via `EdgeOpOverload` check, (2) `copy_meta` on every new node, (3) lift scalars to tensors in edge dialect with `get_const_node`, (4) cache constants with `const_cache`, (5) for bool-output nodes use `callback=lambda m: {**m, "val": m["val"].to(torch.bool)}` in `create_node`, (6) **never pass kwargs** (like `dtype`/`device`) to `graph.create_node` for ATen ops — the ATen IR requires kwargs to be empty (`prepare_pt2e` asserts this); instead rely on `copy_meta` which propagates dtype/device via the FakeTensor in `node.meta["val"]`.
 
 ### Approach C: Built-in Decomposition Table
 **Ref:** `_passes/decompose_triu.py`. Uses `make_fx` + `get_decompositions`. Only works if PyTorch has a registered decomp.
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
@@ -21,6 +21,7 @@
 from .decompose_col_im import DecomposeColIm
 from .decompose_einsum import DecomposeEinsum
 from .decompose_expm1 import DecomposeExpM1
+from .decompose_fill import DecomposeFill
 from .decompose_floor_divide import DecomposeFloorDivide
 from .decompose_glu import DecomposeGlu
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
@@ -80,6 +81,7 @@
     DecomposeColIm,
     DecomposeEinsum,
     DecomposeExpM1,
+    DecomposeFill,
     DecomposeFloorDivide,
     DecomposeGlu,
     DecomposeLinalgVectorNorm,
diff --git a/backends/qualcomm/_passes/decompose_fill.py b/backends/qualcomm/_passes/decompose_fill.py
@@ -0,0 +1,101 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+
+from .utils import copy_meta
+
+
+class DecomposeFill(ExportPass):
+    """
+    Decompose fill.Scalar / fill_.Scalar into full.default.
+
+    fill(input, value) yields a tensor shaped like `input` whose elements all
+    equal `value`, which is what full(input.shape, value) builds. The ATen
+    and the edge dialect overloads are both handled.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.targets = {
+            torch.ops.aten.fill.Scalar,
+            torch.ops.aten.fill_.Scalar,
+            exir_ops.edge.aten.fill.Scalar,
+            exir_ops.edge.aten.fill_.Scalar,
+        }
+
+    @staticmethod
+    def _match_scalar_to_dtype(value, dtype):
+        """
+        Convert a plain Python number to the Python type implied by `dtype`.
+
+        The full node is created without a dtype kwarg, so running it eagerly
+        infers the output dtype from the Python type of the fill value, while
+        fill keeps the dtype of its input. Anything that is not a plain
+        Python number, or cannot be converted, is returned unchanged.
+        """
+        if not isinstance(value, (bool, int, float)):
+            return value
+        try:
+            if dtype == torch.bool:
+                return bool(value)
+            if dtype.is_floating_point:
+                return float(value)
+            if not dtype.is_complex:
+                return int(value)
+        except (OverflowError, ValueError):
+            # e.g. inf/nan for an integer dtype: keep the original value
+            pass
+        return value
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        """
+        Replace every fill node of `graph_module` with an equivalent full node.
+
+        The graph is rewritten in place, dead nodes are eliminated, and the
+        module is returned wrapped in a PassResult.
+        """
+        graph = graph_module.graph
+        # Iterate over a snapshot: nodes are inserted while looping.
+        for node in list(graph.nodes):
+            if node.op != "call_function" or node.target not in self.targets:
+                continue
+
+            input_node, scalar_value = node.args[0], node.args[1]
+            input_val = input_node.meta["val"]
+            # Get the shape from the input tensor metadata
+            shape = list(input_val.shape)
+            fill_value = self._match_scalar_to_dtype(scalar_value, input_val.dtype)
+
+            full_op = (
+                exir_ops.edge.aten.full.default
+                if isinstance(node.target, EdgeOpOverload)
+                else torch.ops.aten.full.default
+            )
+
+            # full depends only on the shape, not on the input data, so it is
+            # inserted right before the fill node: all of its arguments are
+            # defined there and it cannot land between graph placeholders.
+            with graph.inserting_before(node):
+                full_node = graph.create_node(
+                    "call_function",
+                    full_op,
+                    (shape, fill_value),
+                )
+                # dtype/device are carried by the copied meta, not by kwargs
+                full_node.meta = copy_meta(node.meta)
+
+            # NOTE(review): for fill_.Scalar only users of the fill_ result are
+            # redirected; later direct reads of the mutated input are not.
+            # Confirm graphs reaching this pass are functionalized.
+            node.replace_all_uses_with(full_node)
+
+        dead_code_elimination_pass(graph_module)
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -26,6 +26,7 @@
     DecomposeColIm,
     DecomposeEinsum,
     DecomposeExpM1,
+    DecomposeFill,
     DecomposeFloorDivide,
     DecomposeGlu,
     DecomposeLinalgVectorNorm,
@@ -110,6 +111,7 @@ def get_capture_program_passes():
         (DecomposeAny, True),
         (DecomposeAtan2, True),
         (DecomposeColIm, True),
+        (DecomposeFill, True),
         (DecomposeLogVariants, True),
         (DecomposeMaxPool3d, True),
         (DecomposeMinMaxDim, True),
@@ -248,6 +250,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
+        self.add_pass(DecomposeFill())
         self.add_pass(DecomposeGlu())
         # HTP and GPU doesn't support ElementWiseUnary with operation=reciprocal
         # Decompose Reciprocal into Div for these 2 backend
@@ -275,6 +278,7 @@ def transform_for_export_pipeline(
         self.add_pass(DecomposeTriu())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        self.add_pass(DecomposeFill())
         # DecomposeFloorDivide does not apply to the annotation pipeline,
         # since the CPU QDQ model would reduce accuracy.
         # We keep div and floor operations in floating-point to maintain precision.
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
@@ -69,6 +69,7 @@ def get_passes_dependency_for_capture_program():
         DecomposeAny,
         DecomposeAtan2,
         DecomposeColIm,
+        DecomposeFill,
         DecomposeLinalgVectorNorm,
         DecomposeLogVariants,
         DecomposeMaxPool3d,
@@ -104,6 +105,7 @@ def get_passes_dependency_for_capture_program():
         DecomposeAny: [RemoveRedundancy],
         DecomposeAtan2: [RemoveRedundancy],
         DecomposeColIm: [FoldQDQ],
+        DecomposeFill: [RemoveRedundancy],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],
         DecomposeLogVariants: [RemoveRedundancy],
         DecomposeMaxPool3d: [RemoveRedundancy],
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
@@ -506,6 +506,7 @@ The following PyTorch operators are supported through decomposition or annotatio
 | `aten.im2col`, `aten.col2im` | `DecomposeColIm` |
 | `aten.einsum` | `DecomposeEinsum` |
 | `aten.special_expm1` | `DecomposeExpM1` |
+| `aten.fill.Scalar`, `aten.fill_.Scalar` | `DecomposeFill` |
 | `aten.floor_divide` | `DecomposeFloorDivide` |
 | `aten.glu` | `DecomposeGlu` |
 | `aten.linalg_vector_norm` | `DecomposeLinalgVectorNorm` |
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -1115,6 +1115,15 @@ def forward(self, x):
         return torch.special.expm1(x)
 
 
+class Fill(torch.nn.Module):
+    def __init__(self, value):
+        super().__init__()
+        self.value = value
+
+    def forward(self, x):
+        return torch.add(x, torch.fill(x, self.value))
+
+
 class Flip(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -965,6 +965,11 @@ def test_qnn_backend_fp16a8w_fp16_simple_model(self):
         )
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_fill(self):
+        module = Fill(3.14)  # noqa: F405
+        sample_input = (torch.randn(1, 2, 3, 4),)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_flip(self):
         sample_input = (torch.randn(3, 4, 5, 6),)
         module = Flip()  # noqa: F405
@@ -3586,6 +3591,12 @@ def test_qnn_backend_expm1(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_fill(self):
+        module = Fill(3.14)  # noqa: F405
+        sample_input = (torch.randn(1, 2, 3, 4),)
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_flip(self):
         sample_input = (torch.randn(3, 4, 5, 6),)
         module = Flip()  # noqa: F405