
Commit e6cd4a6

committed
Update on "[ET Device Support] DeviceAllocator interface and DeviceAllocatorRegistry"
This diff introduces the `DeviceAllocator` abstract interface and the `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that lets the runtime dispatch memory operations to non-CPU device backends (CUDA, etc.).

**DeviceAllocator interface provides:**
- `init_buffer()` - Initialize memory buffer pools for memory-planned tensors
- `get_offset_address()` - Get a pointer to an offset within a pre-allocated buffer
- `allocate()` / `deallocate()` - Dynamic device memory allocation
- `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device
- `device_type()` - Returns the device type this allocator handles

**DeviceAllocatorRegistry provides:**
- A singleton registry mapping DeviceType → DeviceAllocator
- `register_allocator()` / `get_allocator()` methods
- A fixed-size array indexed by device type (no dynamic allocation, embedded-friendly)

**Design notes:**
- The registry stores raw, non-owning pointers; allocators are expected to be singletons with static lifetime
- Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in the registry)
- Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use

Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/)

[ghstack-poisoned]
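The registry design described above (a fixed-size table indexed by device type, non-owning registration, module-level convenience functions) can be modeled in a short Python sketch. The real interface is C++ inside the ExecuTorch runtime; everything here — the `DeviceType` members, `FakeCudaAllocator`, and all method bodies — is illustrative only, not the actual API.

```python
# Hypothetical Python model of the DeviceAllocatorRegistry design described
# above; the real interface is C++ and none of these names are taken from it.
from enum import IntEnum


class DeviceType(IntEnum):
    CPU = 0
    CUDA = 1
    NUM_DEVICE_TYPES = 2


class DeviceAllocatorRegistry:
    """Fixed-size, non-owning table mapping DeviceType -> allocator."""

    def __init__(self):
        # Fixed-size slot array, no dynamic growth (embedded-friendly).
        self._allocators = [None] * DeviceType.NUM_DEVICE_TYPES

    def register_allocator(self, device_type, allocator):
        # Stores a reference only; the caller keeps the allocator alive
        # (static lifetime in the C++ design).
        self._allocators[device_type] = allocator

    def get_allocator(self, device_type):
        allocator = self._allocators[device_type]
        if allocator is None:
            raise RuntimeError(f"No allocator registered for {device_type!r}")
        return allocator


# Module-level singleton plus convenience free functions, mirroring the
# register_device_allocator() / get_device_allocator() pair in the diff.
_REGISTRY = DeviceAllocatorRegistry()


def register_device_allocator(device_type, allocator):
    _REGISTRY.register_allocator(device_type, allocator)


def get_device_allocator(device_type):
    return _REGISTRY.get_allocator(device_type)
```

Because lookup is an array index rather than a map, dispatch stays O(1) with no heap traffic, which is the embedded-friendly property the design notes call out.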
2 parents 6659f13 + 7f91860 commit e6cd4a6

132 files changed

Lines changed: 6232 additions & 2545 deletions


.ci/scripts/wheel/pre_build_script.sh

Lines changed: 19 additions & 12 deletions
@@ -9,34 +9,41 @@ set -euxo pipefail

 # This script is run before building ExecuTorch binaries

-if [[ "$(uname -m)" == "aarch64" ]]; then
-  # On some Linux aarch64 systems, the "atomic" library is not found during linking.
-  # To work around this, replace "atomic" with the literal ${ATOMIC_LIB} so the
-  # build system uses the full path to the atomic library.
-  file="extension/llm/tokenizers/third-party/sentencepiece/src/CMakeLists.txt"
-  sed 's/list(APPEND SPM_LIBS "atomic")/list(APPEND SPM_LIBS ${ATOMIC_LIB})/' \
-    "$file" > "${file}.tmp" && mv "${file}.tmp" "$file"
-
-  grep -n 'list(APPEND SPM_LIBS ${ATOMIC_LIB})' "$file" && \
-    echo "the file $file has been modified for atomic to use full path"
+# Initialize submodules here instead of during checkout so we can use OpenSSL
+# on Windows (schannel fails with SEC_E_ILLEGAL_MESSAGE on some gitlab hosts).
+UNAME_S=$(uname -s)
+if [[ $UNAME_S == *"MINGW"* || $UNAME_S == *"MSYS"* ]]; then
+  git -c http.sslBackend=openssl submodule update --init
+else
+  git submodule update --init
 fi

 # Clone nested submodules for tokenizers - this is a workaround for recursive
 # submodule clone failing due to path length limitations on Windows. Eventually,
 # we should update the core job in test-infra to enable long paths before
 # checkout to avoid needing to do this.
 pushd extension/llm/tokenizers
-UNAME_S=$(uname -s)
 if [[ $UNAME_S == *"MINGW"* || $UNAME_S == *"MSYS"* ]]; then
   git -c http.sslBackend=openssl submodule update --init
 else
   git submodule update --init
 fi
 popd

+if [[ "$(uname -m)" == "aarch64" ]]; then
+  # On some Linux aarch64 systems, the "atomic" library is not found during linking.
+  # To work around this, replace "atomic" with the literal ${ATOMIC_LIB} so the
+  # build system uses the full path to the atomic library.
+  file="extension/llm/tokenizers/third-party/sentencepiece/src/CMakeLists.txt"
+  sed 's/list(APPEND SPM_LIBS "atomic")/list(APPEND SPM_LIBS ${ATOMIC_LIB})/' \
+    "$file" > "${file}.tmp" && mv "${file}.tmp" "$file"
+
+  grep -n 'list(APPEND SPM_LIBS ${ATOMIC_LIB})' "$file" && \
+    echo "the file $file has been modified for atomic to use full path"
+fi
+
 # On Windows, enable symlinks and re-checkout the current revision to create
 # the symlinked src/ directory. This is needed to build the wheel.
-UNAME_S=$(uname -s)
 if [[ $UNAME_S == *"MINGW"* || $UNAME_S == *"MSYS"* ]]; then
   echo "Enabling symlinks on Windows"
   git config core.symlinks true

.github/workflows/build-wheels-windows.yml

Lines changed: 3 additions & 1 deletion
@@ -64,4 +64,6 @@ jobs:
       smoke-test-script: ${{ matrix.smoke-test-script }}
       trigger-event: ${{ github.event_name }}
       wheel-build-params: "--verbose"
-      submodules: true
+      # Submodules are initialized in pre_build_script.sh with OpenSSL to avoid
+      # schannel SSL errors on Windows when cloning from non-GitHub hosts.
+      submodules: false

.github/workflows/cuda.yml

Lines changed: 3 additions & 0 deletions
@@ -135,6 +135,9 @@ jobs:
           # Run CUDA backend Python tests
           python -m pytest backends/cuda/tests backends/cuda/passes/tests -v -o "addopts="

+          # Build Qwen3.5 MoE runner (ExecuTorch already built above)
+          cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
+
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)

CMakePresets.json

Lines changed: 2 additions & 1 deletion
@@ -152,7 +152,8 @@
         "llm-release"
       ],
       "cacheVariables": {
-        "EXECUTORCH_BUILD_CUDA": "ON"
+        "EXECUTORCH_BUILD_CUDA": "ON",
+        "CMAKE_CUDA_ARCHITECTURES": "native"
       },
       "condition": {
         "type": "inList",

Makefile

Lines changed: 11 additions & 1 deletion
@@ -91,7 +91,7 @@
 #
 # ==============================================================================

-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda clean help

 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -121,6 +121,7 @@ help:
 	@echo " llava-cpu - Build Llava runner with CPU backend"
 	@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
 	@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
+	@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo " clean - Clean build artifacts"

 voxtral-cuda:
@@ -362,6 +363,15 @@ gemma3-cpu:
 	@echo "✓ Build complete!"
 	@echo " Binary: cmake-out/examples/models/gemma3/gemma3_e2e_runner"

+qwen3_5_moe-cuda:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building Qwen3.5 MoE runner with CUDA..."
+	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+
 clean:
 	rm -rf cmake-out \
 	extension/llm/tokenizers/build \

backends/arm/_passes/arm_pass.py

Lines changed: 29 additions & 0 deletions
@@ -11,6 +11,7 @@

 from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
 from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
@@ -124,3 +125,31 @@ def call_shape_operator(
         shape_meta.data[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.SHAPE
         # Call the super (ArmPass) call operator with updated meta
         return self.call_operator(op, args, kwargs, shape_meta, updated)
+
+    def call_scalar(self, value: int | float, meta: NodeMetadata | dict[str, Any]):
+        """Return a scalar value for the current pass stage.
+
+        In transform-for-annotation passes this returns the Python scalar
+        directly. In later passes it materializes a `(1,)` `aten.full` node
+        using the output dtype/device from `meta["val"]` when available.
+
+        """
+
+        if self.is_tfa_pass:
+            return value
+
+        kwargs = {}
+        if "val" in meta:
+            val = meta["val"]
+            if isinstance(val, tuple):
+                val = val[0]
+            kwargs = {"device": val.device, "dtype": val.dtype}
+
+        return ArmPass.call_operator(
+            self,
+            op=exir_ops.edge.aten.full.default,
+            args=((1,), value),
+            kwargs=kwargs,
+            meta=meta,
+            updated=True,
+        )
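The new `call_scalar` helper picks between two behaviors depending on the pass stage. A standalone sketch of that control flow, with a plain dict standing in for the emitted `aten.full` node — the `meta` handling mirrors the diff, but `FakeVal` and the dict node representation are invented for illustration:

```python
# Simplified model of the call_scalar behavior added above: in a
# transform-for-annotation (TFA) pass the Python scalar is returned as-is;
# otherwise a (1,)-shaped "full" op is emitted with dtype/device taken from
# meta["val"]. The dict returned here is a stand-in, not an ExecuTorch node.

def call_scalar(value, meta, is_tfa_pass):
    if is_tfa_pass:
        # Quantization annotation runs on plain scalars; no node is needed.
        return value

    kwargs = {}
    if "val" in meta:
        val = meta["val"]
        if isinstance(val, tuple):
            # Multi-output ops store a tuple of fake tensors; use the first.
            val = val[0]
        kwargs = {"device": val.device, "dtype": val.dtype}

    # Stand-in for emitting an aten.full.default node into the graph.
    return {"op": "aten.full.default", "args": ((1,), value), "kwargs": kwargs}
```

This dual behavior is what lets the decomposition passes moved into the TFA pipeline share one code path with their post-to-edge counterparts.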

backends/arm/_passes/arm_pass_manager.py

Lines changed: 6 additions & 6 deletions
@@ -557,6 +557,12 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                 DecomposeDivTensorModePass(tfa_pass=True),
                 DecomposeWhereScalarOtherPass(tfa_pass=True),
                 RewriteInplaceArithmeticPass(tfa_pass=True),
+                DecomposeAddSubAlphaPass(tfa_pass=True),
+                DecomposeLeakyReLUPass(tfa_pass=True),
+                DecomposeGroupNormPass(tfa_pass=True),
+                DecomposeLayerNormPass(tfa_pass=True),
+                DecomposeVarPass(tfa_pass=True),
+                DecomposeMeanDimPass(graph_module, self.tosa_spec, tfa_pass=True),
             ]
         )

@@ -573,16 +579,10 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_passes(
             [
                 NormalizeWhileInitialArgsPass(use_exir_clone=False, tfa_pass=True),
-                DecomposeAddSubAlphaPass(tfa_pass=True),
-                DecomposeGroupNormPass(tfa_pass=True),
-                DecomposeLayerNormPass(tfa_pass=True),
-                DecomposeVarPass(tfa_pass=True),
-                DecomposeMeanDimPass(graph_module, self.tosa_spec, tfa_pass=True),
                 DecomposeNotEqualPass(tfa_pass=True),
                 DecomposeCosineSimilarityPass(tfa_pass=True),
                 DecomposeGluPass(tfa_pass=True),
                 DecomposeDivPass(tfa_pass=True),
-                DecomposeLeakyReLUPass(tfa_pass=True),
                 DecomposeLinalgVectorNormPass(tfa_pass=True),
                 DecomposeSqrtPass(tfa_pass=True),
                 DecomposeAdaptiveAvgPool2dPass(tfa_pass=True),

backends/arm/_passes/arm_pass_utils.py

Lines changed: 57 additions & 0 deletions
@@ -14,6 +14,7 @@
 import torch.fx
 from executorch.backends.arm.common.debug import get_node_debug_info
 from executorch.backends.arm.common.type import ensure_type
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
@@ -172,6 +173,30 @@ def create_node(
     return node


+def create_shape_node(
+    graph: torch.fx.Graph,
+    op_target: EdgeOpOverload,
+    args: tuple = (),
+    kwargs: Optional[dict] = None,
+    from_node: Optional[torch.fx.Node] = None,
+):
+    """Adds a shape node to 'graph'.
+
+    graph.inserting_before/after() should be used before the call to decide
+    where to insert the node.
+
+    """
+    node = create_node(
+        graph=graph,
+        op_target=op_target,
+        args=args,
+        kwargs=kwargs,
+        from_node=from_node,
+    )
+    node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.SHAPE
+    return node
+
+
 def insert_q_dq_pair(
     graph: torch.fx.Graph,
     anchor: torch.fx.Node,
@@ -211,6 +236,38 @@ def meta_without_qparams(meta: NodeMetadata) -> NodeMetadata:
     return NodeMetadata(plain_meta_dict)


+def insert_scalar(
+    graph: torch.fx.Graph,
+    value: int | float,
+    meta: NodeMetadata | dict,
+    from_node: torch.fx.Node,
+    is_tfa_pass: bool = False,
+) -> torch.fx.Node | int | float:
+    """Insert an `aten.full` scalar node for direct graph-rewrite passes."""
+
+    if is_tfa_pass:
+        return value
+
+    kwargs = {}
+    val = None
+    if "val" in meta:
+        val = meta["val"]
+        if isinstance(val, tuple):
+            val = val[0]
+        kwargs = {"device": val.device, "dtype": val.dtype}
+
+    scalar = create_node(
+        graph=graph,
+        op_target=exir_ops.edge.aten.full.default,
+        args=((1,), value),
+        kwargs=kwargs,
+        from_node=from_node,
+    )
+    if val is not None:
+        scalar.meta["val"] = torch.full((1,), value, **kwargs)
+    return scalar
+
+
 def get_first_fake_tensor(node: torch.fx.Node) -> FakeTensor:
     """Returns a FakeTensor from the meta field of 'node'.

backends/arm/_passes/decompose_add_sub_alpha_pass.py

Lines changed: 2 additions & 13 deletions
@@ -30,24 +30,20 @@ def _get_ops(op):
         if op is exir_ops.edge.aten.add.Tensor:
             return (
                 exir_ops.edge.aten.mul.Tensor,
-                exir_ops.edge.aten.full.default,
                 exir_ops.edge.aten.add.Tensor,
             )
         return (
             torch.ops.aten.mul.Tensor,
-            torch.ops.aten.full.default,
             torch.ops.aten.add.Tensor,
         )
     if op in _SUB_OPS:
         if op is exir_ops.edge.aten.sub.Tensor:
             return (
                 exir_ops.edge.aten.mul.Tensor,
-                exir_ops.edge.aten.full.default,
                 exir_ops.edge.aten.sub.Tensor,
             )
         return (
             torch.ops.aten.mul.Tensor,
-            torch.ops.aten.full.default,
             torch.ops.aten.sub.Tensor,
         )
     raise RuntimeError(f"Unsupported operator {op}")
@@ -72,19 +68,12 @@ def call_operator(self, op, args, kwargs, meta, updated: bool | None = False):
         if not _should_decompose(alpha):
             return super().call_operator(op, args, kwargs, meta, updated)

-        mul_op, full_op, binary_op = _get_ops(op)
+        mul_op, binary_op = _get_ops(op)
         lhs, rhs = args

-        alpha_full = super().call_operator(
-            full_op,
-            ((1,), float(alpha)),
-            {"device": meta["val"].device, "dtype": meta["val"].dtype},
-            meta,
-            updated=True,
-        )
         scaled_rhs = super().call_operator(
             mul_op,
-            (rhs, alpha_full),
+            (rhs, super().call_scalar(alpha, meta)),
             {},
             meta,
             updated=True,
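The pass above decomposes `add(lhs, rhs, alpha)` into `add(lhs, mul(rhs, alpha))` (and the analogous rewrite for `sub`). A pure-Python numeric check of that equivalence, with lists of floats standing in for tensors — an illustration of the arithmetic, not the pass's code:

```python
# Numeric sanity check of the alpha decomposition performed by
# DecomposeAddSubAlphaPass: add(lhs, rhs, alpha) == add(lhs, mul(rhs, alpha)).

def add_with_alpha(lhs, rhs, alpha):
    # Semantics of aten.add.Tensor with an alpha multiplier, elementwise.
    return [l + alpha * r for l, r in zip(lhs, rhs)]


def decomposed_add(lhs, rhs, alpha):
    # The rewritten form: scale rhs by alpha first, then a plain add.
    scaled_rhs = [alpha * r for r in rhs]
    return [l + s for l, s in zip(lhs, scaled_rhs)]


def decomposed_sub(lhs, rhs, alpha):
    # Same rewrite for aten.sub.Tensor: lhs - alpha * rhs.
    scaled_rhs = [alpha * r for r in rhs]
    return [l - s for l, s in zip(lhs, scaled_rhs)]
```

After the `call_scalar` change, the alpha constant itself is only materialized as a `full` node in post-annotation stages, so the rewrite stays usable in the TFA pipeline.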

backends/arm/_passes/decompose_asin_and_acos_pass.py

Lines changed: 5 additions & 14 deletions
@@ -42,7 +42,6 @@ def get_decomposition(op) -> tuple:
         exir_ops.edge.aten.gt.Scalar,
         exir_ops.edge.aten.lt.Scalar,
         exir_ops.edge.aten.sub.Tensor,
-        exir_ops.edge.aten.full_like.default,
         exir_ops.edge.aten.neg.default,
     )

@@ -79,15 +78,12 @@ def _build_polynomial(
         """Helper function to build polynomial from coefficients and
         variable.
         """
-        full_like_op, add_op, mul_op_scalar, mul_op = (
-            exir_ops.edge.aten.full_like.default,
+        add_op, mul_op_scalar, mul_op = (
             exir_ops.edge.aten.add.Tensor,
             exir_ops.edge.aten.mul.Scalar,
             exir_ops.edge.aten.mul.Tensor,
         )
-        result = super().call_operator(
-            full_like_op, (variable, coefficients[0]), {}, meta, True
-        )
+        result = super().call_scalar(coefficients[0], meta)
         for coeff in coefficients[1:]:
             result = super().call_operator(
                 add_op,
@@ -150,7 +146,6 @@ def call_operator(self, op, args, kwargs, meta):
             gt_op,
             lt_op,
             sub_op,
-            full_like_op,
             neg_op,
         ) = get_decomposition(op)

@@ -179,7 +174,7 @@ def call_operator(self, op, args, kwargs, meta):

         # Step 2: Compute the transformed approximation for large values
         # Calculate z = -0.5 * (|x| - 1)
-        tmp_ones = super().call_operator(full_like_op, (x_abs, one), {}, meta, True)
+        tmp_ones = super().call_scalar(one, meta)
         tmp = super().call_operator(sub_op, (x_abs, tmp_ones), {}, meta, True)
         z = super().call_operator(mul_op_scalar, (tmp, neg_half), {}, meta, True)

@@ -201,9 +196,7 @@ def call_operator(self, op, args, kwargs, meta):
         t2 = super().call_operator(mul_op_scalar, (t1, two), {}, meta, True)

         diff = super().call_operator(sub_op_scalar, (t2, pi_over_2), {}, meta, True)
-        tmp_neg_ones = super().call_operator(
-            full_like_op, (diff, neg_one), {}, meta, True
-        )
+        tmp_neg_ones = super().call_scalar(neg_one, meta)
         asin_large = super().call_operator(mul_op, (diff, tmp_neg_ones), {}, meta, True)

         asin_unsigned = self._combine_branches(
@@ -218,9 +211,7 @@ def call_operator(self, op, args, kwargs, meta):

         if op in edge_acos_op:
             # If x <= 0.5: acos(x) = pi/2 - asin(x)
-            const_tensor = super().call_operator(
-                full_like_op, (x, pi_over_2), {}, meta, True
-            )
+            const_tensor = super().call_scalar(pi_over_2, meta)
             acos_small = super().call_operator(
                 sub_op, (const_tensor, asin), {}, meta, True
             )
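The large-|x| branch touched here computes `z = -0.5 * (|x| - 1)` and then `pi/2 - 2*asin(sqrt(z))`, which rests on a standard half-angle identity; the acos branch uses `acos(x) = pi/2 - asin(x)`. A quick numeric check of both identities, using `math.asin` where the pass substitutes its polynomial approximation:

```python
import math

# Checks the identity behind the large-|x| branch of the asin/acos pass:
#   z = -0.5 * (|x| - 1) = (1 - |x|) / 2
#   asin(|x|) = pi/2 - 2 * asin(sqrt(z))
# This is standard trigonometry; the pass approximates the inner asin with a
# polynomial rather than calling a libm asin.

def asin_via_transform(x):
    z = -0.5 * (abs(x) - 1.0)                       # the "Step 2" transform
    unsigned = math.pi / 2 - 2.0 * math.asin(math.sqrt(z))
    return math.copysign(unsigned, x)               # restore the sign of x
```

Mapping |x| near 1 down to z near 0 is what makes a low-degree polynomial accurate there, where a direct asin expansion converges slowly.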
