Revert non-essential modifications to core torch_tensorrt files,
keeping only what TTA strictly requires:
_compile.py:
- Restore module_type == _ModuleType.ep branch (preserve EP input handling)
- Restore load() with extra_files/kwargs support
- Restore save() with all original params (extra_files, use_legacy_exporter,
dynamic_shapes, Input type annotations, full docstring)
- Restore original imports (inspect, Dict/Tuple, default_device, etc.)
- Keep only the post-trace hook loop as the TTA addition
_defaults.py / _settings.py:
- Remove editable_timing_cache, error_on_timing_cache_miss (autotune, out of scope)
- Restore DECOMPOSE_ATTENTION and decompose_attention field
- Restore cpu_memory_budget: Optional[int]
- Keep profiling_verbosity (needed for ILayer.metadata inspection)
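The field changes above can be summarized as a dataclass sketch. Field names mirror the commit message; the class name, defaults, and the string type for `profiling_verbosity` are assumptions for illustration, not the real `CompilationSettings` definition.

```python
from dataclasses import dataclass
from typing import Optional

# Assumed default; in the real code this would come from _defaults.py
# as DECOMPOSE_ATTENTION.
DECOMPOSE_ATTENTION = False


@dataclass
class CompilationSettingsSketch:
    # Restored: toggles decomposition of attention ops into smaller ops.
    decompose_attention: bool = DECOMPOSE_ATTENTION
    # Restored as Optional[int]: None means no CPU memory budget.
    cpu_memory_budget: Optional[int] = None
    # Kept: needed so ILayer.metadata is populated for inspection
    # ("detailed" is an assumed default, not taken from the source).
    profiling_verbosity: str = "detailed"
```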
_TRTInterpreter.py:
- Remove algorithm_selector parameter (autotune, out of scope)
- Remove _mark_debug_candidates / mark_debug logic (debug feature, out of scope)
- Remove editable_timing_cache / error_on_timing_cache_miss flag handling
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
py/torch_tensorrt/dynamo/_settings.py (+5 -7 lines)
@@ -20,17 +20,16 @@
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
     DLA_SRAM_SIZE,
+    DECOMPOSE_ATTENTION,
     DYNAMICALLY_ALLOCATE_RESOURCES,
     DRYRUN,
-    EDITABLE_TIMING_CACHE,
     ENABLE_AUTOCAST,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     ENABLE_RESOURCE_PARTITIONING,
     ENABLE_WEIGHT_STREAMING,
     ENABLED_PRECISIONS,
     ENGINE_CAPABILITY,
-    ERROR_ON_TIMING_CACHE_MISS,
     HARDWARE_COMPATIBLE,
     IMMUTABLE_WEIGHTS,
     L2_LIMIT_FOR_TILING,
@@ -110,8 +109,6 @@ class CompilationSettings:
         True will enable cross-platform compatibility which allows the engine to be built on Linux and run on Windows
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
-        editable_timing_cache (bool): Allow TensorRT to write new timing measurements into the timing cache during build (TRT 10.8+). Enable this on the first run so the cache is fully populated; subsequent runs can then load the cache and reproduce the same tactic selection. Default: False.
-        error_on_timing_cache_miss (bool): Raise a build error if any tactic's timing is not found in the loaded timing cache (TRT 10.8+). Use in combination with a pre-populated ``timing_cache_path`` to guarantee that no re-profiling occurs and tactic selection is identical to the seed run, producing bitwise-identical engines. Default: False.
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
         autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
@@ -122,6 +119,7 @@ class CompilationSettings:
         autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
         offload_module_to_cpu (bool): Offload the model to CPU to reduce memory footprint during compilation
         dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
+        decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.