File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -338,6 +338,7 @@ def get_extra_aoti_compile_context_manager(cls):
338338 compilation for the CUDA backend. Each manager is documented at
339339 its own `enter_context` call site below.
340340 """
341+
341342 @contextlib .contextmanager
342343 def _combined ():
343344 with contextlib .ExitStack () as stack :
@@ -346,9 +347,7 @@ def _combined():
346347 # them. SDPA ops already replaced by Triton kernels via
347348 # `ReplaceEdgeOpWithTritonOpPass` are unaffected; this is
348349 # only the fallback for the `triton_kernel_mode="OFF"` path.
349- stack .enter_context (
350- torch .nn .attention .sdpa_kernel ([SDPBackend .MATH ])
351- )
350+ stack .enter_context (torch .nn .attention .sdpa_kernel ([SDPBackend .MATH ]))
352351 # Force AOTI's mutated-buffer clones onto CPU during compile
353352 # so we stay under tight GPU memory caps (e.g. 24 GB on a
354353 # consumer 4090). See `_compile_time_cpu_clones` for details.
@@ -387,7 +386,6 @@ def preprocess_multimethod(
387386
388387 # Aggressive GPU cleanup between methods
389388 if torch .cuda .is_available ():
390- pre_mem = torch .cuda .memory_allocated ()
391389 gc .collect ()
392390 freed = 0
393391 for obj in gc .get_objects ():
You can’t perform that action at this time.
0 commit comments