pytorch
diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
Lines changed: 27 additions & 1 deletion
diff --git a/.github/workflows/build-wheels-aarch64-linux.yml b/.github/workflows/build-wheels-aarch64-linux.yml
Lines changed: 0 additions & 2 deletions
diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml
Lines changed: 0 additions & 2 deletions
diff --git a/.github/workflows/build-wheels-macos.yml b/.github/workflows/build-wheels-macos.yml
Lines changed: 0 additions & 2 deletions
diff --git a/.github/workflows/build-wheels-windows.yml b/.github/workflows/build-wheels-windows.yml
Lines changed: 0 additions & 2 deletions
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
Lines changed: 34 additions & 4 deletions
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 1 addition & 5 deletions
diff --git a/backends/arm/_passes/annotate_output_dim_order_pass.py b/backends/arm/_passes/annotate_output_dim_order_pass.py
Lines changed: 0 additions & 28 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 4 additions & 2 deletions
diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py
Lines changed: 0 additions & 5 deletions
@@ -415,14 +415,40 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
 
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
+  EXPORT_LOG=$(mktemp)
   TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
   python -m executorch.examples.models.qwen3_5_moe.export \
       --prequantized "$LOCAL_MODEL_DIR" \
       --output-dir "${OUTPUT_DIR}" \
       --dense-prefill dequant \
-      --moe-activation-dtype int8
+      --moe-activation-dtype int8 2>&1 | tee "$EXPORT_LOG"
+  EXPORT_RC=${PIPESTATUS[0]}
   echo "::endgroup::"
 
+  if [ "$EXPORT_RC" -ne 0 ]; then
+    echo "ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC)"
+    rm -f "$EXPORT_LOG"
+    exit "$EXPORT_RC"
+  fi
+
+  # Gate peak GPU memory so we keep the export viable on consumer GPUs
+  # (e.g. RTX 4090 with 24 GB). The export script prints a machine-
+  # parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <float>".
+  EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
+  # Accept only a plain decimal value: awk compares non-numeric operands as strings.
+  PEAK_MB=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1 | sed -nE 's/^EXPORT_GPU_PEAK_MEMORY_MB:[[:space:]]*([0-9]+([.][0-9]+)?)[[:space:]]*$/\1/p')
+  rm -f "$EXPORT_LOG"
+  if [ -z "$PEAK_MB" ]; then
+    echo "ERROR: export did not emit a valid EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
+    exit 1
+  fi
+  echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
+  if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN{exit !(p+0 > l+0)}'; then
+    echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
+    echo "       — this would prevent the model from being exported on a 24 GB consumer GPU."
+    exit 1
+  fi
+
   test -f "${OUTPUT_DIR}/model.pte"
   test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
   ls -al "${OUTPUT_DIR}"
 
@@ -9,8 +9,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-    tags:
-      - ciflow/binaries/*
   push:
     branches:
       - nightly
 
@@ -9,8 +9,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-    tags:
-      - ciflow/binaries/*
   push:
     branches:
       - nightly
 
@@ -9,8 +9,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-    tags:
-      - ciflow/binaries/*
   push:
     branches:
       - nightly
 
@@ -8,8 +8,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-    tags:
-      - ciflow/binaries/*
   push:
     branches:
       - nightly
 
@@ -9,7 +9,7 @@
 import typing
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Any, Dict, List, Set
+from typing import Any, Dict, List, Optional, Set
 
 import torch
 from executorch.backends.aoti.passes.replace_view_copy_with_view import (
@@ -88,8 +88,14 @@ def save_data_externally(cls) -> bool:
         return False
 
     @classmethod
-    def get_extra_aoti_compile_context_manager(cls):
-        """Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager."""
+    def get_extra_aoti_compile_context_manager(
+        cls, compile_specs: Optional[List[CompileSpec]] = None
+    ):
+        """Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager.
+
+        Subclasses may inspect ``compile_specs`` to opt into behaviors that
+        only apply to specific methods/models (e.g. low-memory export).
+        """
         return contextlib.nullcontext()
 
     @classmethod
@@ -105,6 +111,24 @@ def codesign_so(cls, so_path: str, compile_specs: List[CompileSpec]) -> None:
         """
         return
 
+    @classmethod
+    def release_moved_tensors(
+        cls,
+        device_edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> None:
+        """Release device memory held by tensors moved by ``move_to_device_pass``.
+
+        ``preprocess`` calls this hook with ``device_edge_program`` and its own
+        ``compile_specs`` just before returning its result, so that the next
+        ``preprocess`` call (e.g. for the next method in a multi-method export)
+        can reuse the freed memory. Concrete backends (e.g. ``CudaBackend``)
+        override it to actually free device memory.
+
+        The base implementation is a no-op.
+        """
+        return
+
     @classmethod
     @contextlib.contextmanager
     def collect_unsupported_fallback_kernels(cls, missing_fallback_kernels: Set[str]):
@@ -208,7 +232,7 @@ def preprocess(
         # Compile with fallback kernel collection
         with cls.collect_unsupported_fallback_kernels(
             missing_fallback_kernels
-        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager():
+        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager(compile_specs):
             paths = torch._inductor.aot_compile(
                 edge_program_module, tuple(user_input_placeholders), options=options
             )
@@ -269,6 +293,12 @@ def preprocess(
         os.remove(so_path)
         os.remove(blob_path)
 
+        # Release device memory held by tensors that ``move_to_device_pass``
+        # placed on the target device. Default impl is a no-op; concrete
+        # backends (e.g. CudaBackend) override this to free GPU memory before
+        # the next preprocess call (e.g. for the next method).
+        cls.release_moved_tensors(device_edge_program, compile_specs)
+
         return PreprocessResult(
             processed_bytes=b"",
             debug_handle_map={},
 
@@ -7,7 +7,6 @@
 from . import arm_pass_utils  # noqa
 from .arm_pass import ArmPass  # noqa  # usort: skip
 from .accumulate_index_put_pass import AccumulateIndexPutPass  # noqa
-from .annotate_output_dim_order_pass import AnnotateOutputDimOrderPass  # noqa
 from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .canonicalize_gather_pass import CanonicalizeGatherPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
@@ -61,9 +60,6 @@
 from .decompose_index_tensor_to_gather_pass import (  # noqa
     DecomposeIndexTensorToGatherPass,
 )
-from .decompose_int16_activation_conv_pass import (  # noqa
-    DecomposeConvWithInt16ActivationPass,
-)
 from .decompose_int_pow_pass import DecomposeIntPowPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
@@ -77,6 +73,7 @@
 from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
+from .decompose_permute_for_u55_pass import DecomposePermuteForU55Pass  # noqa
 from .decompose_quant_nodes import DecomposeQuantNodesPass  # noqa
 from .decompose_remainder_pass import DecomposeRemainderPass  # noqa
 from .decompose_rnn_pass import DecomposeRnnPass  # noqa
@@ -167,7 +164,6 @@
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa
-from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
 from .replace_inf_and_limit_values_pass import (  # noqa  # usort: skip
 
@@ -75,6 +75,7 @@
     DecomposeMaxPool2dPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
+    DecomposePermuteForU55Pass,
     DecomposeQuantNodesPass,
     DecomposeRemainderPass,
     DecomposeRnnPass,
@@ -438,6 +439,7 @@ def _tosa_pipeline(
                 ConvertSplitToSlicePass(),
                 QuantizeClampArgumentsPass(),
                 RemoveGetItemPass(),
+                FuseBatchNorm2dPass(exported_program),
                 DecomposeBatchNormNoStatsPass(),
                 DecomposeLogitPass(),
                 DecomposeMaskedFillPass(),
@@ -501,7 +503,6 @@ def _tosa_pipeline(
                 RewriteBoolBitwiseToLogicalPass(),
                 DecomposeRemainderPass(),
                 DecomposeDivTensorModePass(),
-                FuseBatchNorm2dPass(exported_program),
                 ConvertMmToBmmPass(),
                 DecomposeGluPass(),
                 DecomposeDivPass(),
@@ -536,13 +537,14 @@ def _tosa_pipeline(
                 RewriteConvPass(exported_program),
                 RewriteMatmulPass(),
                 RewritePadPass(),
-                RewriteSlicePass(),
                 FuseViewCopyTransformPass(),
                 RemovePermutesAroundElementwiseOps(),
                 PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView(),
                 FuseCascadedTransposeOrPermuteOps(),
                 ConvertPermuteSingletonToViewPass(),
                 RewriteHighRankSingletonPermutePass(),
+                DecomposePermuteForU55Pass(),
+                RewriteSlicePass(),
                 InsertConstShapesPass(),
             ]
         )
 
@@ -364,11 +364,6 @@ def set_node_arg(node: torch.fx.Node, i: int | str, value):
         raise RuntimeError("Invalid type")
 
 
-def get_output_dim_orders(graph_module):
-    output_node = graph_module.graph.output_node()
-    return [get_first_fake_tensor(node).dim_order() for node in output_node.args[0]]
-
-
 def is_nested_control_flow_graph(graph_module: GraphModule) -> bool:
     """Returns True if graph_module is a nested control-flow graph."""