28 changes: 27 additions & 1 deletion .ci/scripts/export_model_artifact.sh
@@ -415,12 +415,38 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then

# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
echo "::group::Export"
EXPORT_LOG=$(mktemp)
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
python -m executorch.examples.models.qwen3_5_moe.export \
--prequantized "$LOCAL_MODEL_DIR" \
--output-dir "${OUTPUT_DIR}" 2>&1 | tee "$EXPORT_LOG"
EXPORT_RC=${PIPESTATUS[0]}
echo "::endgroup::"

if [ "$EXPORT_RC" -ne 0 ]; then
echo "ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC)"
rm -f "$EXPORT_LOG"
exit "$EXPORT_RC"
fi

# Gate peak GPU memory so we keep the export viable on consumer GPUs
# (e.g. RTX 4090 with 24 GB). The export script prints a machine-
# parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <float>".
EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
PEAK_LINE=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1)
rm -f "$EXPORT_LOG"
if [ -z "$PEAK_LINE" ]; then
echo "ERROR: export did not emit EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
exit 1
fi
PEAK_MB=$(echo "$PEAK_LINE" | awk '{print $2}')
echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN{exit !(p>l)}'; then
echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
echo " — this would prevent the model from being exported on a 24 GB consumer GPU."
exit 1
fi

test -f "${OUTPUT_DIR}/model.pte"
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
ls -al "${OUTPUT_DIR}"
157 changes: 144 additions & 13 deletions backends/cuda/cuda_backend.py
@@ -5,9 +5,11 @@
# LICENSE file in the root directory of this source tree.


import contextlib
import logging
import os
import shutil
import threading
import typing
from importlib import resources
from typing import Any, Dict, final, List, Optional
@@ -27,6 +29,83 @@
from torch.nn.attention import SDPBackend


# ---------------------------------------------------------------------------
# AOTI compile-time CPU clones for mutated buffers
# ---------------------------------------------------------------------------
#
# Inductor's `_unlift_graph` clones every mutated buffer that gets lifted into
# the AOTI graph. By default it clones on whatever device the original tensor
# lives on, which after `move_to_device_pass` is CUDA. For large models like
# Qwen3.5-MoE that means an extra ~18 GB GPU clone during compile, blowing past
# the 24 GB cap we want to honor for consumer GPUs (RTX 4090 and similar).
#
# The patch below side-steps that by:
# 1. Wrapping `torch._inductor.compile_fx.clone_preserve_strides` so every
# clone the AOTI compile pipeline produces lands on CPU.
# 2. Wrapping `CppWrapperCpu.codegen_device` so the C++ wrapper still records
# the model's original target device (e.g. cuda) in `constants_info_`,
# not the now-CPU storage device. Without this the runtime would refuse
# to load the constants because of a mixed-device mismatch.
#
# The wrappers are scoped via a thread-local guard and are only active while
# `_compile_time_cpu_clones(...)` is on the call stack — they are inert
# anywhere else in the process.

_CPU_CLONE_GUARD = threading.local()


def _is_cpu_clone_active() -> bool:
return getattr(_CPU_CLONE_GUARD, "active", False)


@contextlib.contextmanager
def _compile_time_cpu_clones(target_device: torch.device):
"""Force AOTI's mutated-buffer clones onto CPU while preserving the
serialized constants' target device."""
from torch._inductor import compile_fx as _cfx
from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu as _Cpp

orig_clone = _cfx.clone_preserve_strides
orig_codegen_device = _Cpp.codegen_device

def _cpu_clone_preserve_strides(x: torch.Tensor) -> torch.Tensor:
# `clone_preserve_strides` is shared by `_unlift_graph` (clones
# lifted buffers — can be safely kept on CPU) and by autotuning code
# in `triton_heuristics.py` (clones for benchmark — must stay on
# GPU for Triton). Discriminate by caller frame so we only force
# CPU clones for the buffer-lifting path.
import sys

caller = sys._getframe(1).f_code.co_name
if caller == "_unlift_graph":
return orig_clone(x).cpu()
return orig_clone(x)

def _codegen_device_target_aware(self, device):
        # Translate an accidental CPU device back to the model's target
        # device, but only while a constant we forced to CPU is being
        # serialized. Other code paths (extern op args, etc.) pass through.
if (
_is_cpu_clone_active()
and self.device != "cpu"
and isinstance(device, torch.device)
and device.type == "cpu"
):
device = target_device
return orig_codegen_device(self, device)

_cfx.clone_preserve_strides = _cpu_clone_preserve_strides
_Cpp.codegen_device = _codegen_device_target_aware
prev_active = getattr(_CPU_CLONE_GUARD, "active", False)
_CPU_CLONE_GUARD.active = True
try:
yield
finally:
_CPU_CLONE_GUARD.active = prev_active
_cfx.clone_preserve_strides = orig_clone
_Cpp.codegen_device = orig_codegen_device
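
A minimal usage sketch (illustration only, not part of the diff) of the guard's scoping, assuming a CUDA-enabled PyTorch build:

```python
import torch
from torch._inductor import compile_fx

# The patch is live only inside the `with` block; the original Inductor
# entry points are restored on exit, even if compilation raises.
original = compile_fx.clone_preserve_strides
with _compile_time_cpu_clones(torch.device("cuda")):
    # torch._inductor.aot_compile(...) would run here: clones issued from
    # `_unlift_graph` land on CPU, autotuning clones keep their device.
    assert compile_fx.clone_preserve_strides is not original
assert compile_fx.clone_preserve_strides is original
```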


@final
@experimental(
"This API and all of cuda backend related functionality are experimental."
@@ -255,17 +334,69 @@ def get_aoti_compile_options(
@classmethod
def get_extra_aoti_compile_context_manager(cls):
"""
Combine all extra context managers needed during AOTInductor
compilation for the CUDA backend. Each manager is documented at
its own `enter_context` call site below.
"""

@contextlib.contextmanager
def _combined():
with contextlib.ExitStack() as stack:
# Force any remaining PyTorch SDPA ops to use the MATH
# backend during compilation so AOTI can lower / decompose
# them. SDPA ops already replaced by Triton kernels via
# `ReplaceEdgeOpWithTritonOpPass` are unaffected; this is
# only the fallback for the `triton_kernel_mode="OFF"` path.
stack.enter_context(torch.nn.attention.sdpa_kernel([SDPBackend.MATH]))
# Force AOTI's mutated-buffer clones onto CPU during compile
# so we stay under tight GPU memory caps (e.g. 24 GB on a
# consumer 4090). See `_compile_time_cpu_clones` for details.
stack.enter_context(
_compile_time_cpu_clones(torch.device(cls.get_device_name()))
)
yield

return _combined()
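
For orientation, a sketch of how the combined manager would be consumed by a compile path; the class name `CudaBackend` and the `aot_compile` call site are assumptions, not taken from this diff:

```python
import torch

# Hypothetical call site: entering the combined manager once covers the
# whole AOTInductor invocation with MATH-backend SDPA and CPU-side clones.
with CudaBackend.get_extra_aoti_compile_context_manager():
    ...  # e.g. torch._inductor.aot_compile(gm, example_inputs)
```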

@classmethod
def preprocess_multimethod(
cls,
edge_programs,
compile_specs,
):
"""
Override of base preprocess_multimethod to run aggressive GPU cleanup
between methods (e.g. decode then prefill). Inductor caches hold CUDA
tensors from the first compilation, causing the second to OOM under
        tight VRAM caps (e.g. 24 GB, simulating an RTX 4090).
"""
import gc

preprocess_results = {}
for method_name, programs in edge_programs.items():
assert method_name in compile_specs
compile_specs_for_method = compile_specs[method_name]
assert len(compile_specs_for_method) == len(programs)
results_for_method = []
for program, compile_spec_for_program in zip(
programs, compile_specs_for_method
):
preprocess_result = cls.preprocess(program, compile_spec_for_program)
results_for_method.append(preprocess_result)

            # Aggressive GPU cleanup between methods: release the device
            # storage of every CUDA tensor still reachable from Python so
            # the next method's compilation starts from an empty allocator.
            if torch.cuda.is_available():
                gc.collect()
                freed = 0
                for obj in gc.get_objects():
                    if isinstance(obj, torch.Tensor) and obj.is_cuda:
                        try:
                            obj.untyped_storage().resize_(0)
                            freed += 1
                        except Exception:
                            pass
                gc.collect()
                torch.cuda.empty_cache()
                logging.debug("Released storage of %d CUDA tensors", freed)

preprocess_results[method_name] = results_for_method
return preprocess_results
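
A self-contained sketch of the storage-release trick the cleanup loop above relies on (assumes a CUDA device is present; the tensor shape is illustrative):

```python
import torch

t = torch.empty(1024, 1024, device="cuda")
before = torch.cuda.memory_allocated(0)
# Shrinking the untyped storage to zero bytes frees the device allocation
# while the Python `Tensor` object stays alive (though no longer usable).
t.untyped_storage().resize_(0)
assert torch.cuda.memory_allocated(0) < before
# Return cached blocks to the driver so other consumers can see the memory.
torch.cuda.empty_cache()
```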
14 changes: 14 additions & 0 deletions examples/models/qwen3_5_moe/export.py
@@ -967,6 +967,13 @@ def main(): # noqa: C901
# Register FLA Triton kernel (CUDA only)
import executorch.backends.cuda.triton.kernels # noqa: F401

    # Reset peak GPU memory stats so we can report the actual peak
    # consumed during the export pipeline (load + quantize + lowering)
    # at the very end. CI gates the reported value to make sure the
    # export stays viable on low-VRAM GPUs (e.g. RTX 4090, 24 GB).
if torch.cuda.is_available():
torch.cuda.reset_peak_memory_stats(0)

if args.backend == "mlx":
if args.prequantized:
parser.error("--prequantized is not supported with --backend mlx")
@@ -989,6 +996,13 @@ def main(): # noqa: C901

export_and_lower(model, config, args)

    # Report the peak GPU memory consumed during the export so CI and
    # users can gate it against a known budget (e.g. 24 GB consumer GPUs).
if args.backend == "cuda" and torch.cuda.is_available():
peak_mb = torch.cuda.max_memory_allocated(0) / (1024 * 1024)
# Stable, machine-parseable marker for CI grep.
print(f"EXPORT_GPU_PEAK_MEMORY_MB: {peak_mb:.2f}")


if __name__ == "__main__":
main()