
Commit a8ef2a4

low mem export controlled by compile spec
1 parent 2163d1c commit a8ef2a4

5 files changed

Lines changed: 127 additions & 37 deletions

File tree

backends/aoti/aoti_backend.py
backends/cuda/benchmarks/benchmark_int4_matvec.py
backends/cuda/cuda_backend.py
backends/cuda/triton/kernels/fused_moe.py
examples/models/qwen3_5_moe/export.py

backends/aoti/aoti_backend.py

Lines changed: 7 additions & 3 deletions
@@ -88,8 +88,12 @@ def save_data_externally(cls) -> bool:
         return False
 
     @classmethod
-    def get_extra_aoti_compile_context_manager(cls):
-        """Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager."""
+    def get_extra_aoti_compile_context_manager(cls, compile_specs: List[CompileSpec]):
+        """Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager.
+
+        Subclasses may inspect ``compile_specs`` to opt into behaviors that
+        only apply to specific methods/models (e.g. low-memory export).
+        """
         return contextlib.nullcontext()
 
     @classmethod
@@ -195,7 +199,7 @@ def preprocess(
         # Compile with fallback kernel collection
         with cls.collect_unsupported_fallback_kernels(
             missing_fallback_kernels
-        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager():
+        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager(compile_specs):
             paths = torch._inductor.aot_compile(
                 edge_program_module, tuple(user_input_placeholders), options=options
             )

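To illustrate the new hook signature, here is a minimal sketch of a subclass that keys extra compile-time behavior off a compile spec. The MyAotiBackend class, the "verbose_logging" spec key, and the _log_compile helper are hypothetical, and the import paths are only inferred from the file paths in this commit:

import contextlib
from typing import List

from executorch.backends.aoti.aoti_backend import AotiBackend  # assumed import path
from executorch.exir.backend.compile_spec_schema import CompileSpec


@contextlib.contextmanager
def _log_compile():
    # Hypothetical helper: print markers around the aoti_compile stage.
    print("entering aoti_compile")
    try:
        yield
    finally:
        print("leaving aoti_compile")


class MyAotiBackend(AotiBackend):
    @classmethod
    def get_extra_aoti_compile_context_manager(cls, compile_specs: List[CompileSpec]):
        # Opt in only when a spec explicitly asks for it; otherwise keep the
        # default no-op behavior of the base class.
        for spec in compile_specs:
            if spec.key == "verbose_logging" and spec.value == b"ON":
                return _log_compile()
        return contextlib.nullcontext()
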
backends/cuda/benchmarks/benchmark_int4_matvec.py

Whitespace-only changes.

backends/cuda/cuda_backend.py

Lines changed: 52 additions & 17 deletions
@@ -332,12 +332,28 @@ def get_aoti_compile_options(
         return options
 
     @classmethod
-    def get_extra_aoti_compile_context_manager(cls):
+    def get_extra_aoti_compile_context_manager(cls, compile_specs: List[CompileSpec]):
         """
         Combine all extra context managers needed during AOTInductor
         compilation for the CUDA backend. Each manager is documented at
         its own `enter_context` call site below.
+
+        The low-memory export monkey-patch (CPU clones for mutated buffers)
+        is gated on the ``low_memory_mode`` compile spec — only models that
+        explicitly opt in (currently Qwen3.5 MoE) get it. Other models go
+        through the unmodified AOTI codepath, which avoids regressions in
+        their cuda CI exports.
         """
+        # Parse compile_specs for low_memory_mode (default OFF)
+        low_memory_mode = "OFF"
+        for spec in compile_specs:
+            if spec.key == "low_memory_mode":
+                mode = spec.value.decode("utf-8").upper()
+                if mode not in ["ON", "OFF"]:
+                    raise ValueError(
+                        f"Invalid low_memory_mode: {mode}. Expected 'ON' or 'OFF'."
+                    )
+                low_memory_mode = mode
 
         @contextlib.contextmanager
         def _combined():
@@ -348,16 +364,30 @@ def _combined():
                 # `ReplaceEdgeOpWithTritonOpPass` are unaffected; this is
                 # only the fallback for the `triton_kernel_mode="OFF"` path.
                 stack.enter_context(torch.nn.attention.sdpa_kernel([SDPBackend.MATH]))
-                # Force AOTI's mutated-buffer clones onto CPU during compile
-                # so we stay under tight GPU memory caps (e.g. 24 GB on a
-                # consumer 4090). See `_compile_time_cpu_clones` for details.
-                stack.enter_context(
-                    _compile_time_cpu_clones(torch.device(cls.get_device_name()))
-                )
+                if low_memory_mode == "ON":
+                    # Force AOTI's mutated-buffer clones onto CPU during
+                    # compile so we stay under tight GPU memory caps (e.g.
+                    # 24 GB on a consumer 4090). See
+                    # `_compile_time_cpu_clones` for details. Only enabled
+                    # for models that explicitly opt in via the
+                    # `low_memory_mode="ON"` compile spec, since the
+                    # monkey-patch can interact poorly with other models'
+                    # AOTI compile pipelines.
+                    stack.enter_context(
+                        _compile_time_cpu_clones(torch.device(cls.get_device_name()))
+                    )
                 yield
 
         return _combined()
 
+    @staticmethod
+    def _is_low_memory_mode(compile_specs: List[CompileSpec]) -> bool:
+        """Return True if any compile spec opts into low-memory export."""
+        for spec in compile_specs:
+            if spec.key == "low_memory_mode":
+                return spec.value.decode("utf-8").upper() == "ON"
+        return False
+
     @classmethod
     def preprocess_multimethod(
         cls,
@@ -369,6 +399,11 @@ def preprocess_multimethod(
         between methods (e.g. decode then prefill). Inductor caches hold CUDA
         tensors from the first compilation, causing the second to OOM under
         tight VRAM caps (e.g. 24GB simulating an RTX 4090).
+
+        The aggressive cleanup (resizing every CUDA tensor's storage to 0)
+        is only enabled for methods that opt into ``low_memory_mode="ON"``
+        — it can otherwise break models that expect their CUDA tensors to
+        stay live across method preprocessing.
         """
         import gc
 
@@ -384,17 +419,17 @@ def preprocess_multimethod(
                 preprocess_result = cls.preprocess(program, compile_spec_for_program)
                 results_for_method.append(preprocess_result)
 
-                # Aggressive GPU cleanup between methods
+                # GPU cleanup between methods. Aggressive storage resize is
+                # only run for methods that opt into low-memory mode.
                 if torch.cuda.is_available():
-                    gc.collect()
-                    freed = 0
-                    for obj in gc.get_objects():
-                        if isinstance(obj, torch.Tensor) and obj.is_cuda:
-                            try:
-                                obj.untyped_storage().resize_(0)
-                                freed += 1
-                            except Exception:
-                                pass
+                    if cls._is_low_memory_mode(compile_spec_for_program):
+                        gc.collect()
+                        for obj in gc.get_objects():
+                            if isinstance(obj, torch.Tensor) and obj.is_cuda:
+                                try:
+                                    obj.untyped_storage().resize_(0)
+                                except Exception:
+                                    pass
                     gc.collect()
                     torch.cuda.empty_cache()

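As a quick check of the gating semantics above, here is a self-contained sketch of the same parse logic. The CompileSpec stand-in below is a simplified substitute for the real class in executorch.exir.backend.compile_spec_schema, defined here only so the snippet runs on its own:

from dataclasses import dataclass
from typing import List


@dataclass
class CompileSpec:
    # Simplified stand-in for executorch's CompileSpec (key: str, value: bytes).
    key: str
    value: bytes


def is_low_memory_mode(compile_specs: List[CompileSpec]) -> bool:
    """Mirror of _is_low_memory_mode: default OFF, case-insensitive value."""
    for spec in compile_specs:
        if spec.key == "low_memory_mode":
            return spec.value.decode("utf-8").upper() == "ON"
    return False


print(is_low_memory_mode([CompileSpec("low_memory_mode", b"ON")]))   # True
print(is_low_memory_mode([CompileSpec("low_memory_mode", b"off")]))  # False
print(is_low_memory_mode([]))                                        # False (default)
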
backends/cuda/triton/kernels/fused_moe.py

Lines changed: 65 additions & 17 deletions
@@ -702,30 +702,74 @@ def moe_align_block_size(
 # Autotune configs for batched GEMM1 (gate+up projection).
 # BLOCK_M is fixed at _BATCHED_BLOCK_M; only N and K are tuned.
 _BATCHED_GEMM1_CONFIGS = [
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16}, num_warps=4, num_stages=3),
     triton.Config(
-        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=2
+        {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=3,
     ),
     triton.Config(
-        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16}, num_warps=4, num_stages=2
+        {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=2,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16},
+        num_warps=4,
+        num_stages=2,
     ),
 ]
 
 # Autotune configs for batched GEMM2 (down projection + SiLU).
 _BATCHED_GEMM2_CONFIGS = [
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16}, num_warps=4, num_stages=3),
-    triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=2),
     triton.Config(
-        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=2
+        {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16},
+        num_warps=4,
+        num_stages=3,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=2,
+    ),
+    triton.Config(
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8},
+        num_warps=4,
+        num_stages=2,
     ),
     triton.Config(
-        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16}, num_warps=4, num_stages=2
+        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16},
+        num_warps=4,
+        num_stages=2,
     ),
 ]
 
@@ -831,7 +875,8 @@ def _fused_moe_batched_kernel(
                 B_scale
                 + expert_id * stride_bse
                 + offs_n[None, :] * stride_bsn
-                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size) * stride_bsk
+                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size)
+                * stride_bsk
             )
             b_scale = tl.load(
                 scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0
@@ -967,7 +1012,8 @@ def _fused_moe_batched_int8_kernel(
                 B_scale
                 + expert_id * stride_bse
                 + offs_n[None, :] * stride_bsn
-                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size) * stride_bsk
+                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size)
+                * stride_bsk
             )
             b_scale = tl.load(
                 scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0
@@ -1085,7 +1131,8 @@ def _fused_moe_silu_batched_kernel(
                 B_scale
                 + expert_id * stride_bse
                 + offs_n[None, :] * stride_bsn
-                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size) * stride_bsk
+                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size)
+                * stride_bsk
             )
             b_scale = tl.load(
                 scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0
@@ -1227,7 +1274,8 @@ def _fused_moe_silu_batched_int8_kernel(
                 B_scale
                 + expert_id * stride_bse
                 + offs_n[None, :] * stride_bsn
-                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size) * stride_bsk
+                + ((offs_k[:, None] + BLOCK_SIZE_K * k_step) // group_size)
+                * stride_bsk
             )
             b_scale = tl.load(
                 scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0

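The fused_moe.py changes above are formatting only. For context, this is roughly how such config lists are consumed by Triton's autotuner; the kernel below is a made-up placeholder whose body and key names are illustrative, not taken from fused_moe.py:

import triton
import triton.language as tl

# Illustrative config list in the same shape as _BATCHED_GEMM1_CONFIGS.
_EXAMPLE_CONFIGS = [
    triton.Config(
        {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8},
        num_warps=4,
        num_stages=3,
    ),
    triton.Config(
        {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16},
        num_warps=4,
        num_stages=2,
    ),
]


# triton.autotune benchmarks each config per (N, K) shape and injects the
# winning dict entries as tl.constexpr parameters of the kernel.
@triton.autotune(configs=_EXAMPLE_CONFIGS, key=["N", "K"])
@triton.jit
def _example_copy_kernel(
    x_ptr,
    N,
    K,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Trivial body; a real kernel would tile its loads and stores by these sizes.
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    vals = tl.load(x_ptr + offs, mask=offs < N, other=0.0)
    tl.store(x_ptr + offs, vals, mask=offs < N)
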
examples/models/qwen3_5_moe/export.py

Lines changed: 3 additions & 0 deletions
@@ -886,6 +886,7 @@ def _export_cuda(model, config, args):
         ExecutorchBackendConfig,
         to_edge_transform_and_lower,
     )
+    from executorch.exir.backend.compile_spec_schema import CompileSpec
     from executorch.exir.passes import MemoryPlanningPass
     from torch.export import Dim, export
 
@@ -959,13 +960,15 @@ def _export_cuda(model, config, args):
             CudaPartitioner(
                 [
                     CudaBackend.generate_method_name_compile_spec("decode"),
+                    CompileSpec("low_memory_mode", b"ON"),
                 ]
             )
         ],
         "prefill": [
             CudaPartitioner(
                 [
                     CudaBackend.generate_method_name_compile_spec("prefill"),
+                    CompileSpec("low_memory_mode", b"ON"),
                 ]
             )
         ],

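Because both method entries now carry the same pair of specs, a small helper could keep them in sync. The helper name and import paths below are assumptions; only generate_method_name_compile_spec and CompileSpec("low_memory_mode", b"ON") come from this diff:

from typing import List

from executorch.backends.cuda.cuda_backend import CudaBackend  # assumed import path
from executorch.exir.backend.compile_spec_schema import CompileSpec


def low_memory_specs(method_name: str) -> List[CompileSpec]:
    # Hypothetical convenience helper: method-name spec plus the low-memory opt-in.
    return [
        CudaBackend.generate_method_name_compile_spec(method_name),
        CompileSpec("low_memory_mode", b"ON"),
    ]


# Usage mirroring the export script above:
# partitioners = {
#     "decode": [CudaPartitioner(low_memory_specs("decode"))],
#     "prefill": [CudaPartitioner(low_memory_specs("prefill"))],
# }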