pytorch
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
Lines changed: 48 additions & 98 deletions
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
Lines changed: 2 additions & 2 deletions
diff --git a/py/torch_tensorrt/dynamo/_tracer.py b/py/torch_tensorrt/dynamo/_tracer.py
Lines changed: 1 addition & 2 deletions
@@ -5,7 +5,7 @@
 import os
 import platform
 import warnings
-from typing import Any, Callable, Collection, Dict, List, Optional, Sequence, Set, Tuple, Union
+from typing import Any, Callable, Collection, List, Optional, Sequence, Set, Tuple, Union
 
 import torch
 from torch.export import ExportedProgram
@@ -40,7 +40,6 @@
     post_lowering,
     pre_export_lowering,
 )
-
 from torch_tensorrt.dynamo.partitioning._resource_partitioner import (
     resource_partition,
 )
@@ -57,66 +56,32 @@
 
 logger = logging.getLogger(__name__)
 
-# Passes registered by extensions at import time.
-# Each entry is called after post_lowering and before TRT conversion.
-#
-# Pass contract:
-#   fn(exported_program=..., fx_module=..., logger=...)
-#
-# A pass may either:
-#   - mutate in place and return None, or
-#   - return (exported_program, fx_module) to replace the current objects.
-#
-# Extensions must never be imported by torch_tensorrt — they register
-# themselves here instead.
+# Extension hook registries — populated at import time by external modules
+# (e.g. torch_tensorrt.annotation).  torch_tensorrt itself never imports
+# those modules; they register themselves here instead.
 _compile_passes: List[Callable] = []
-
-# ExportedProgram attribute names that extensions want preserved across
-# run_decompositions() (which returns a fresh EP object, discarding any
-# custom attributes set before the call).  Extensions register attribute
-# names at import time via register_preserved_ep_attr().
 _preserved_ep_attrs: List[str] = []
-
-# Context-manager factories called around torch.export.export when an
-# nn.Module is traced.  Each factory receives (model, inputs) and returns
-# a context manager.  Registered by extensions at import time.
 _export_context_factories: List[Callable] = []
-
-# Hooks called after dynamo_trace() (torch.export) and before dynamo_compile().
-# Each hook receives (exported_program, inputs) and may return a new EP.
 _post_trace_hooks: List[Callable] = []
 
 
 def register_compile_pass(fn: Callable) -> None:
-    """Register a pre-TRT FX pass. Called by extensions at import time."""
+    """Register a pre-TRT FX pass called after post_lowering."""
     _compile_passes.append(fn)
 
 
 def register_preserved_ep_attr(name: str) -> None:
-    """Preserve a custom ExportedProgram attribute across run_decompositions().
-
-    Extensions that store state on ``exported_program.<name>`` before calling
-    ``torch_tensorrt.compile`` should register that attribute name here so the
-    compiler can copy it onto the new EP returned by ``run_decompositions()``.
-    """
+    """Preserve a custom ExportedProgram attribute across run_decompositions()."""
     _preserved_ep_attrs.append(name)
 
 
 def register_export_context(fn: Callable) -> None:
-    """Register a context factory ``fn(model, inputs)`` wrapping torch.export.export.
-
-    Called by extensions at import time so that custom state can be active
-    during the export step without modifying the tracer directly.
-    """
+    """Register a context factory ``fn(model, inputs)`` wrapping torch.export.export."""
     _export_context_factories.append(fn)
 
 
 def register_post_trace_hook(fn: Callable) -> None:
-    """Register a hook called after torch.export and before TRT compilation.
-
-    ``fn(exported_program, inputs)`` may return a new ExportedProgram to
-    replace the current one, or ``None`` to leave it unchanged.
-    """
+    """Register a hook called after torch.export and before TRT compilation."""
     _post_trace_hooks.append(fn)
 
 
@@ -174,6 +139,7 @@ def cross_compile_for_windows(
     enable_resource_partitioning: bool = _defaults.ENABLE_RESOURCE_PARTITIONING,
     cpu_memory_budget: Optional[int] = _defaults.CPU_MEMORY_BUDGET,
     dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
+    decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -251,6 +217,7 @@ def cross_compile_for_windows(
         enable_resource_partitioning (bool): Enable resource-aware partitioning. This is useful when the model is large and the CPU memory is limited.
         cpu_memory_budget (Optional[int]): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail.
         dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
+        decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -409,6 +376,7 @@ def cross_compile_for_windows(
         "enable_resource_partitioning": enable_resource_partitioning,
         "cpu_memory_budget": cpu_memory_budget,
         "dynamically_allocate_resources": dynamically_allocate_resources,
+        "decompose_attention": decompose_attention,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -428,16 +396,10 @@ def cross_compile_for_windows(
 
     settings = CompilationSettings(**compilation_options)
     logger.info("Compilation Settings: %s\n", settings)
-    # Preserve custom EP attributes across run_decompositions() — the returned
-    # EP is a fresh object and does not carry any attributes set by extensions.
-    _saved_ep_attrs = {
-        k: getattr(exported_program, k)
-        for k in _preserved_ep_attrs
-        if hasattr(exported_program, k)
-    }
     exported_program = pre_export_lowering(exported_program, settings)
+    _saved_ep_attrs = {k: getattr(exported_program, k) for k in _preserved_ep_attrs if hasattr(exported_program, k)}
     exported_program = exported_program.run_decompositions(
-        get_decompositions(enable_experimental_decompositions)
+        get_decompositions(enable_experimental_decompositions, decompose_attention)
     )
     for k, v in _saved_ep_attrs.items():
         try:
@@ -448,23 +410,23 @@ def cross_compile_for_windows(
     gm = exported_program.module()
     logger.debug("Input graph: " + str(gm.graph))
 
-    # Apply lowering on the graph module
+    # Apply lowering on the graph module. Note: constant_fold runs inside post_lowering and requires
+    # module parameters to still be on GPU, so we must not deallocate before this call.
     gm = post_lowering(gm, settings)
+    logger.debug(f"CPU memory usage after post_lowering: {get_cpu_memory_usage()} MB")
     logger.debug("Lowered Input graph: " + str(gm.graph))
     for _pass in _compile_passes:
-        _result = _pass(
-            exported_program=exported_program,
-            fx_module=gm,
-            logger=logger,
-        )
+        _result = _pass(exported_program=exported_program, fx_module=gm, logger=logger)
         if isinstance(_result, tuple):
             exported_program, gm = _result
+
     # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
-        deallocate_module(exported_program.module(), delete_module=False)
+        deallocate_module(gm)
         logger.info(
             "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"
         )
+        logger.debug(f"CPU memory usage after CPU offload: {get_cpu_memory_usage()} MB")
     else:
         remaining_memory, total_memory = torch.cuda.mem_get_info()
         if remaining_memory < total_memory // 2:
@@ -546,6 +508,7 @@ def compile(
     cpu_memory_budget: Optional[int] = _defaults.CPU_MEMORY_BUDGET,
     enable_resource_partitioning: bool = _defaults.ENABLE_RESOURCE_PARTITIONING,
     dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
+    decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     profiling_verbosity: Optional[Any] = None,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
@@ -634,6 +597,7 @@ def compile(
         enable_resource_partitioning (bool): Enable resource-aware partitioning. This is useful when the model is large and the CPU memory is limited.
         cpu_memory_budget (Optional[int]): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail.
         dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
+        decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -648,7 +612,7 @@ def compile(
 
     if not kwargs.get("use_explicit_typing", False):
         warnings.warn(
-            "`use_explicit_typing` is deprecated. This setting will be removed and you should enable autocast instead.",
+            "`use_explicit_typing` is deprecated. Explicit typing is now on by default; this setting will be removed, and you should enable autocast to recover weak typing behavior.",
             DeprecationWarning,
             stacklevel=2,
         )
@@ -837,21 +801,17 @@ def compile(
         "enable_resource_partitioning": enable_resource_partitioning,
         "cpu_memory_budget": cpu_memory_budget,
         "dynamically_allocate_resources": dynamically_allocate_resources,
-        "profiling_verbosity": profiling_verbosity,
+        "decompose_attention": decompose_attention,
     }
+    if profiling_verbosity is not None:
+        compilation_options["profiling_verbosity"] = profiling_verbosity
     logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
     settings = CompilationSettings(**compilation_options)
     logger.info("Compilation Settings: %s\n", settings)
-    # Preserve custom EP attributes across run_decompositions() — the returned
-    # EP is a fresh object and does not carry any attributes set by extensions.
-    _saved_ep_attrs = {
-        k: getattr(exported_program, k)
-        for k in _preserved_ep_attrs
-        if hasattr(exported_program, k)
-    }
     exported_program = pre_export_lowering(exported_program, settings)
+    _saved_ep_attrs = {k: getattr(exported_program, k) for k in _preserved_ep_attrs if hasattr(exported_program, k)}
     exported_program = exported_program.run_decompositions(
-        get_decompositions(enable_experimental_decompositions)
+        get_decompositions(enable_experimental_decompositions, decompose_attention)
     )
     for k, v in _saved_ep_attrs.items():
         try:
@@ -863,24 +823,19 @@ def compile(
     # Move the weights in the state_dict to CPU
     logger.debug("Input graph: " + str(gm.graph))
 
-    # Apply lowering on the graph module
+    # Apply lowering on the graph module. Note: constant_fold runs inside post_lowering and requires
+    # module parameters to still be on GPU, so we must not deallocate before this call.
     gm = post_lowering(gm, settings)
     logger.debug(f"CPU memory usage after post_lowering: {get_cpu_memory_usage()} MB")
     logger.debug("Lowered Input graph: " + str(gm.graph))
-
     for _pass in _compile_passes:
-        _result = _pass(
-            exported_program=exported_program,
-            fx_module=gm,
-            logger=logger,
-        )
+        _result = _pass(exported_program=exported_program, fx_module=gm, logger=logger)
         if isinstance(_result, tuple):
             exported_program, gm = _result
 
     # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
-        deallocate_module(gm, delete_module=False)
-        deallocate_module(exported_program.module(), delete_module=False)
+        deallocate_module(gm)
         logger.info(
             "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"
         )
@@ -892,11 +847,7 @@ def compile(
                 "Remaining GPU memory may not be enough to compile the TensorRT engine for this model resulting in an OOM error, Consider setting offload_module_to_cpu=True"
             )
     trt_gm = compile_module(
-        gm,
-        trt_arg_inputs,
-        trt_kwarg_inputs,
-        settings,
-        engine_cache,
+        gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache
     )
     return trt_gm
 
@@ -1152,7 +1103,6 @@ def preserve_module_specs(
             trt_modules[name] = trt_module
 
             if _debugger_config:
-
                 if _debugger_config.save_engine_profile:
                     if settings.use_python_runtime:
                         if _debugger_config.profile_format != "cudagraph":
@@ -1267,6 +1217,7 @@ def convert_exported_program_to_serialized_trt_engine(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1341,6 +1292,7 @@ def convert_exported_program_to_serialized_trt_engine(
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model.
+        decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         **kwargs: Any,
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
@@ -1510,20 +1462,15 @@ def convert_exported_program_to_serialized_trt_engine(
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "decompose_attention": decompose_attention,
     }
 
     settings = CompilationSettings(**compilation_options)
     logger.info("Compilation Settings: %s\n", settings)
-    # Preserve custom EP attributes across run_decompositions() — the returned
-    # EP is a fresh object and does not carry any attributes set by extensions.
-    _saved_ep_attrs = {
-        k: getattr(exported_program, k)
-        for k in _preserved_ep_attrs
-        if hasattr(exported_program, k)
-    }
     exported_program = pre_export_lowering(exported_program, settings)
+    _saved_ep_attrs = {k: getattr(exported_program, k) for k in _preserved_ep_attrs if hasattr(exported_program, k)}
     exported_program = exported_program.run_decompositions(
-        get_decompositions(enable_experimental_decompositions)
+        get_decompositions(enable_experimental_decompositions, decompose_attention)
     )
     for k, v in _saved_ep_attrs.items():
         try:
@@ -1539,17 +1486,13 @@ def convert_exported_program_to_serialized_trt_engine(
     gm = post_lowering(gm, settings)
     logger.debug("Lowered Input graph: " + str(gm.graph))
     for _pass in _compile_passes:
-        _result = _pass(
-            exported_program=exported_program,
-            fx_module=gm,
-            logger=logger,
-        )
+        _result = _pass(exported_program=exported_program, fx_module=gm, logger=logger)
         if isinstance(_result, tuple):
             exported_program, gm = _result
 
     # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
-        deallocate_module(exported_program.module(), delete_module=False)
+        deallocate_module(exported_program.module())
         logger.info(
             "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"
         )
@@ -1560,6 +1503,9 @@ def convert_exported_program_to_serialized_trt_engine(
                 "Remaining GPU memory may not be enough to compile the TensorRT engine for this model resulting in an OOM error, Consider setting offload_module_to_cpu=True"
             )
 
+    if trt_kwarg_inputs is None:
+        trt_kwarg_inputs = {}
+
     flattened_input_list = get_flat_args_with_check(
         exported_program, list(trt_arg_inputs), trt_kwarg_inputs
     )[0]
@@ -1571,16 +1517,20 @@ def convert_exported_program_to_serialized_trt_engine(
             settings=settings,
             engine_cache=engine_cache,
         )
-    except UnsupportedOperatorException:
+    except UnsupportedOperatorException as e:
         logger.error(
             f"Conversion of module {gm} not currently fully supported or convertible!",
             exc_info=True,
         )
+        raise UnsupportedOperatorException(
+            f"Conversion of module {gm} not currently fully supported or convertible!"
+        ) from e
     except Exception as e:
         logger.error(
             f"While interpreting the module got an error: {e}",
             exc_info=True,
         )
+        raise RuntimeError(f"While interpreting the module got an error: {e}") from e
 
     serialized_engine: bytes = interpreter_result.serialized_engine
     return serialized_engine
 
@@ -16,13 +16,13 @@
     AUTOCAST_MAX_OUTPUT_THRESHOLD,
     CACHE_BUILT_ENGINES,
     CPU_MEMORY_BUDGET,
+    DECOMPOSE_ATTENTION,
     DISABLE_TF32,
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
     DLA_SRAM_SIZE,
-    DECOMPOSE_ATTENTION,
-    DYNAMICALLY_ALLOCATE_RESOURCES,
     DRYRUN,
+    DYNAMICALLY_ALLOCATE_RESOURCES,
     ENABLE_AUTOCAST,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
 
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from contextlib import ExitStack
 from inspect import signature
 from typing import Any, Optional, Tuple, Union
 
@@ -76,8 +77,6 @@ def trace(
     # Constructing dynamic shape list as a nested dict
     dynamic_shapes = get_dynamic_shapes_args(mod, arg_inputs)
     dynamic_shapes.update(get_dynamic_shapes_kwargs(kwarg_inputs))
-    # Apply any registered export-context factories.
-    from contextlib import ExitStack
     from torch_tensorrt.dynamo._compiler import _export_context_factories
     with ExitStack() as _stack:
         for _factory in _export_context_factories: