Skip to content

Commit 9adb92f

Browse files
authored
Merge branch 'main' into android-combined-v2
2 parents bd3ae1c + 8a397b4 commit 9adb92f

58 files changed

Lines changed: 869 additions & 2084 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/scripts/export_model_artifact.sh

Lines changed: 27 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -415,14 +415,40 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
415415

416416
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417417
echo "::group::Export"
418+
EXPORT_LOG=$(mktemp)
418419
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419420
python -m executorch.examples.models.qwen3_5_moe.export \
420421
--prequantized "$LOCAL_MODEL_DIR" \
421422
--output-dir "${OUTPUT_DIR}" \
422423
--dense-prefill dequant \
423-
--moe-activation-dtype int8
424+
--moe-activation-dtype int8 2>&1 | tee "$EXPORT_LOG"
425+
EXPORT_RC=${PIPESTATUS[0]}
424426
echo "::endgroup::"
425427

428+
if [ "$EXPORT_RC" -ne 0 ]; then
429+
echo "ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC)"
430+
rm -f "$EXPORT_LOG"
431+
exit "$EXPORT_RC"
432+
fi
433+
434+
# Gate peak GPU memory so we keep the export viable on consumer GPUs
435+
# (e.g. RTX 4090 with 24 GB). The export script prints a machine-
436+
# parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <float>".
437+
EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
438+
PEAK_LINE=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1)
439+
rm -f "$EXPORT_LOG"
440+
if [ -z "$PEAK_LINE" ]; then
441+
echo "ERROR: export did not emit EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
442+
exit 1
443+
fi
444+
PEAK_MB=$(echo "$PEAK_LINE" | awk '{print $2}')
445+
echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
446+
if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN{exit !(p>l)}'; then
447+
echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
448+
echo " — this would prevent the model from being exported on a 24 GB consumer GPU."
449+
exit 1
450+
fi
451+
426452
test -f "${OUTPUT_DIR}/model.pte"
427453
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
428454
ls -al "${OUTPUT_DIR}"

.github/workflows/build-wheels-aarch64-linux.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@ on:
99
- examples/**/*
1010
- pyproject.toml
1111
- setup.py
12-
tags:
13-
- ciflow/binaries/*
1412
push:
1513
branches:
1614
- nightly

.github/workflows/build-wheels-linux.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@ on:
99
- examples/**/*
1010
- pyproject.toml
1111
- setup.py
12-
tags:
13-
- ciflow/binaries/*
1412
push:
1513
branches:
1614
- nightly

.github/workflows/build-wheels-macos.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@ on:
99
- examples/**/*
1010
- pyproject.toml
1111
- setup.py
12-
tags:
13-
- ciflow/binaries/*
1412
push:
1513
branches:
1614
- nightly

.github/workflows/build-wheels-windows.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@ on:
88
- examples/**/*
99
- pyproject.toml
1010
- setup.py
11-
tags:
12-
- ciflow/binaries/*
1311
push:
1412
branches:
1513
- nightly

backends/aoti/aoti_backend.py

Lines changed: 34 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import typing
1010
from abc import ABC, abstractmethod
1111
from enum import Enum
12-
from typing import Any, Dict, List, Set
12+
from typing import Any, Dict, List, Optional, Set
1313

1414
import torch
1515
from executorch.backends.aoti.passes.replace_view_copy_with_view import (
@@ -88,8 +88,14 @@ def save_data_externally(cls) -> bool:
8888
return False
8989

9090
@classmethod
91-
def get_extra_aoti_compile_context_manager(cls):
92-
"""Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager."""
91+
def get_extra_aoti_compile_context_manager(
92+
cls, compile_specs: Optional[List[CompileSpec]] = None
93+
):
94+
"""Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager.
95+
96+
Subclasses may inspect ``compile_specs`` to opt into behaviors that
97+
only apply to specific methods/models (e.g. low-memory export).
98+
"""
9399
return contextlib.nullcontext()
94100

95101
@classmethod
@@ -105,6 +111,24 @@ def codesign_so(cls, so_path: str, compile_specs: List[CompileSpec]) -> None:
105111
"""
106112
return
107113

114+
@classmethod
115+
def release_moved_tensors(
116+
cls,
117+
device_edge_program: ExportedProgram,
118+
compile_specs: List[CompileSpec],
119+
) -> None:
120+
"""Release device memory held by tensors that ``move_to_device_pass``
121+
placed on the target device.
122+
123+
Called at the end of ``preprocess`` so that the next ``preprocess``
124+
call (e.g. for the next method in a multi-method export) can reuse
125+
the freed memory. Override in concrete backends (e.g. ``CudaBackend``)
126+
to actually free device memory.
127+
128+
Default: no-op.
129+
"""
130+
return
131+
108132
@classmethod
109133
@contextlib.contextmanager
110134
def collect_unsupported_fallback_kernels(cls, missing_fallback_kernels: Set[str]):
@@ -208,7 +232,7 @@ def preprocess(
208232
# Compile with fallback kernel collection
209233
with cls.collect_unsupported_fallback_kernels(
210234
missing_fallback_kernels
211-
), torch.no_grad(), cls.get_extra_aoti_compile_context_manager():
235+
), torch.no_grad(), cls.get_extra_aoti_compile_context_manager(compile_specs):
212236
paths = torch._inductor.aot_compile(
213237
edge_program_module, tuple(user_input_placeholders), options=options
214238
)
@@ -269,6 +293,12 @@ def preprocess(
269293
os.remove(so_path)
270294
os.remove(blob_path)
271295

296+
# Release device memory held by tensors that ``move_to_device_pass``
297+
# placed on the target device. Default impl is a no-op; concrete
298+
# backends (e.g. CudaBackend) override this to free GPU memory before
299+
# the next preprocess call (e.g. for the next method).
300+
cls.release_moved_tensors(device_edge_program, compile_specs)
301+
272302
return PreprocessResult(
273303
processed_bytes=b"",
274304
debug_handle_map={},

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@
77
from . import arm_pass_utils # noqa
88
from .arm_pass import ArmPass # noqa # usort: skip
99
from .accumulate_index_put_pass import AccumulateIndexPutPass # noqa
10-
from .annotate_output_dim_order_pass import AnnotateOutputDimOrderPass # noqa
1110
from .broadcast_args_pass import BroadcastArgsPass # noqa
1211
from .canonicalize_gather_pass import CanonicalizeGatherPass # noqa
1312
from .cast_int64_pass import CastInt64BuffersToInt32Pass # noqa
@@ -61,9 +60,6 @@
6160
from .decompose_index_tensor_to_gather_pass import ( # noqa
6261
DecomposeIndexTensorToGatherPass,
6362
)
64-
from .decompose_int16_activation_conv_pass import ( # noqa
65-
DecomposeConvWithInt16ActivationPass,
66-
)
6763
from .decompose_int_pow_pass import DecomposeIntPowPass # noqa
6864
from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa
6965
from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa
@@ -77,6 +73,7 @@
7773
from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass # noqa
7874
from .decompose_meandim_pass import DecomposeMeanDimPass # noqa
7975
from .decompose_ne_pass import DecomposeNotEqualPass # noqa
76+
from .decompose_permute_for_u55_pass import DecomposePermuteForU55Pass # noqa
8077
from .decompose_quant_nodes import DecomposeQuantNodesPass # noqa
8178
from .decompose_remainder_pass import DecomposeRemainderPass # noqa
8279
from .decompose_rnn_pass import DecomposeRnnPass # noqa
@@ -167,7 +164,6 @@
167164
from .rewrite_upsample import RewriteUpsamplePass # noqa
168165
from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa
169166
from .size_adjust_input_pass import SizeAdjustInputPass # noqa
170-
from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass # noqa
171167
from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass # noqa
172168
from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass # noqa
173169
from .replace_inf_and_limit_values_pass import ( # noqa # usort: skip

backends/arm/_passes/annotate_output_dim_order_pass.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,7 @@
7575
DecomposeMaxPool2dPass,
7676
DecomposeMeanDimPass,
7777
DecomposeNotEqualPass,
78+
DecomposePermuteForU55Pass,
7879
DecomposeQuantNodesPass,
7980
DecomposeRemainderPass,
8081
DecomposeRnnPass,
@@ -438,6 +439,7 @@ def _tosa_pipeline(
438439
ConvertSplitToSlicePass(),
439440
QuantizeClampArgumentsPass(),
440441
RemoveGetItemPass(),
442+
FuseBatchNorm2dPass(exported_program),
441443
DecomposeBatchNormNoStatsPass(),
442444
DecomposeLogitPass(),
443445
DecomposeMaskedFillPass(),
@@ -501,7 +503,6 @@ def _tosa_pipeline(
501503
RewriteBoolBitwiseToLogicalPass(),
502504
DecomposeRemainderPass(),
503505
DecomposeDivTensorModePass(),
504-
FuseBatchNorm2dPass(exported_program),
505506
ConvertMmToBmmPass(),
506507
DecomposeGluPass(),
507508
DecomposeDivPass(),
@@ -536,13 +537,14 @@ def _tosa_pipeline(
536537
RewriteConvPass(exported_program),
537538
RewriteMatmulPass(),
538539
RewritePadPass(),
539-
RewriteSlicePass(),
540540
FuseViewCopyTransformPass(),
541541
RemovePermutesAroundElementwiseOps(),
542542
PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView(),
543543
FuseCascadedTransposeOrPermuteOps(),
544544
ConvertPermuteSingletonToViewPass(),
545545
RewriteHighRankSingletonPermutePass(),
546+
DecomposePermuteForU55Pass(),
547+
RewriteSlicePass(),
546548
InsertConstShapesPass(),
547549
]
548550
)

backends/arm/_passes/arm_pass_utils.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -364,11 +364,6 @@ def set_node_arg(node: torch.fx.Node, i: int | str, value):
364364
raise RuntimeError("Invalid type")
365365

366366

367-
def get_output_dim_orders(graph_module):
368-
output_node = graph_module.graph.output_node()
369-
return [get_first_fake_tensor(node).dim_order() for node in output_node.args[0]]
370-
371-
372367
def is_nested_control_flow_graph(graph_module: GraphModule) -> bool:
373368
"""Returns True if graph_module is a nested control-flow graph."""
374369

0 commit comments

Comments
 (0)