
Commit 493cf05

Bowen Fu and claude committed
feat(annotation/e2e): add LLM-domain, dynamic-shape, and cross-backend E2E tests
New tests in _BackendE2ETests mixin (×3 backends):
- test_llm_hidden_unary/binary: activation/gating at [batch=8, hidden=512]
- test_dynamic_batch: dynamic batch dim via torch_tensorrt.Input
- test_gated_ffn_llm: full FFN at hidden=256, inter=512 (2× LLM ratio)

New TestCrossBackendE2E class (9 tests):
- Verifies that Triton, CuTile, and CuTeDSL PluginV3 ops coexist in one engine.
- All at LLM-domain [batch=8, hidden=512].

Bug fixes triggered by dynamic-shape tests:
- _qdp_utils.py: _collect_shape_var_bindings now binds non-constant ShapeExpr dims (is_constant=False) so meta_impl runs during get_output_shapes for dynamic-shape engines.
- _qdp_utils.py: _safe_dim default changed from 256 → 1 so CuTeDSL dummy tensors compiled for dynamic dims use a [1, static_dim] layout whose offset formula (offset=idx) is correct for any batch size at runtime. (Using 256 caused the kernel to access only 8/256 columns, giving 96.8% output mismatch.)
- _triton.py: use _as_symint32() for grid_x/y/z assignment (matching CuTile and CuTeDSL) so _ShapeDim/ShapeExpr values are properly wrapped before being stored in KernelLaunchParams. Previously, direct assignment caused a TRT segfault during dynamic-shape engine builds.

E2E test count: 19 → 40 (50.6% of 79 total, up from 32%).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
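As context for the dynamic-shape tests, test_dynamic_batch presumably declares the dynamic batch dimension through torch_tensorrt.Input range shapes. A minimal sketch of that pattern follows; the toy module and the min/opt/max values are assumptions, and only the [batch, hidden=512] shape comes from the message above.

import torch
import torch_tensorrt

# Toy module standing in for the annotated plugin op under test (assumption).
model = torch.nn.Linear(512, 512).cuda().eval()

# Dynamic batch dim via range shapes; the exact min/opt/max are illustrative.
inputs = [
    torch_tensorrt.Input(
        min_shape=(1, 512),
        opt_shape=(8, 512),   # LLM-domain shape used by the new tests
        max_shape=(32, 512),
        dtype=torch.float32,
    )
]

trt_model = torch_tensorrt.compile(model, inputs=inputs)
out = trt_model(torch.randn(8, 512, device="cuda"))  # any batch in [1, 32] works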
1 parent 9693733 commit 493cf05

3 files changed: 304 additions & 34 deletions


py/torch_tensorrt/annotation/_custom_plugin/_aot/_triton.py

Lines changed: 6 additions & 6 deletions
@@ -378,13 +378,13 @@ def _to_int(x: Any) -> Any:
     # extra_args from shape_expr-derived scalars; only tile sizes in constexprs.
     launch = trtp.KernelLaunchParams()
     if isinstance(grid, tuple):
-        launch.grid_x = grid[0] if len(grid) >= 1 else 1
-        launch.grid_y = grid[1] if len(grid) >= 2 else 1
-        launch.grid_z = grid[2] if len(grid) >= 3 else 1
+        launch.grid_x = _as_symint32(grid[0]) if len(grid) >= 1 else trtp.SymInt32(1)
+        launch.grid_y = _as_symint32(grid[1]) if len(grid) >= 2 else trtp.SymInt32(1)
+        launch.grid_z = _as_symint32(grid[2]) if len(grid) >= 3 else trtp.SymInt32(1)
     else:
-        launch.grid_x = grid
-        launch.grid_y = 1
-        launch.grid_z = 1
+        launch.grid_x = _as_symint32(grid)
+        launch.grid_y = trtp.SymInt32(1)
+        launch.grid_z = trtp.SymInt32(1)

     launch.block_x = compiled.metadata.num_warps * 32
     launch.block_y = 1
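_as_symint32 itself is defined in _qdp_utils.py; its final fallback, return trtp.SymInt32(1), is visible at the top of the next diff. Below is a hedged sketch of what such a normalizer plausibly does: the signature and the fallback line are taken from the diffs, while the middle branches are illustrative guesses, not the real implementation.

from typing import Any

import tensorrt.plugin as trtp  # assumption: the `trtp` these modules import


def _as_symint32(v: Any) -> Any:
    """Normalize a grid dimension to trtp.SymInt32 before it is stored in
    KernelLaunchParams (storing raw _ShapeDim/ShapeExpr objects segfaulted TRT)."""
    if isinstance(v, trtp.SymInt32):
        return v  # already wrapped: pass through unchanged
    if isinstance(v, int):
        return trtp.SymInt32(v)  # concrete grid size
    if getattr(v, "is_constant", False):
        return trtp.SymInt32(int(v))  # symbolic expression that folds to a constant
    # A truly dynamic dim would be translated to TRT's symbolic form here;
    # that path is not shown in this commit. The visible fallback is:
    return trtp.SymInt32(1)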

py/torch_tensorrt/annotation/_custom_plugin/_qdp_utils.py

Lines changed: 33 additions & 8 deletions
@@ -302,12 +302,18 @@ def _as_symint32(v: Any) -> Any:
     return trtp.SymInt32(1)


-def _safe_dim(d: Any, default: int = 256) -> int:
+def _safe_dim(d: Any, default: int = 1) -> int:
     """Extract a concrete int from a shape element safely.

     For TRT SymInt32 elements (dynamic shapes), calling int() directly does NOT raise
     but returns a garbage pointer-like value (~470 TB), causing OOM. Check
     _int_expr.is_constant() first; return *default* for dynamic dims.
+
+    The default is 1 (minimum valid tensor dimension) rather than a larger value
+    so that dummy tensors constructed for kernel compilation use the most compact
+    shape. CuTeDSL kernels bake the tensor layout into their type; using 1 for
+    dynamic dims gives a [1, static_dim, ...] dummy whose row-major offset formula
+    (offset = idx) is valid for any larger batch size at runtime.
     """
     if isinstance(d, int):
         return d
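Read as a whole, _safe_dim plausibly looks like the sketch below: the int fast path and the default mirror the diff, and the rest renders the docstring's _int_expr.is_constant() check as code (the exact attribute handling is an assumption). The 96.8% figure in the commit message lines up with this arithmetic: with only 8 of 256 baked columns accessed, 1 − 8/256 = 96.875% of outputs mismatch.

from typing import Any


def _safe_dim(d: Any, default: int = 1) -> int:
    # Plain ints pass through (from the diff). Symbolic dims are checked via
    # the `_int_expr` the docstring names before calling int(), which would
    # otherwise return a garbage pointer-like value for dynamic dims.
    if isinstance(d, int):
        return d
    expr = getattr(d, "_int_expr", None)  # attribute name from the docstring
    if expr is not None and expr.is_constant():
        return int(d)  # constant under the hood: int() is safe here
    return default  # dynamic dim: 1 keeps the compiled dummy tensor compact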
@@ -461,13 +467,21 @@ def collect_allowed_formats_for_io(


 def _collect_shape_var_bindings(shape_expr: Any, bindings: Dict[int, int]) -> None:
-    """Recursively find free/fake shape vars and assign them the minimum valid value.
+    """Recursively find free/fake/dynamic shape vars and assign them the minimum valid value.

     Walks *shape_expr* recursively (handling nested tensors with a `.shape_expr`
     attribute and plain list/tuple containers) and populates *bindings* with a
     mapping from ``id(var)`` → 1 for every element that is not a plain ``int``
-    and has ``is_fake == True``. The value 1 is the minimum positive integer
-    accepted as a tensor dimension.
+    and is either:
+    - marked as fake (``is_fake == True``), or
+    - a non-constant symbolic expression (``is_constant == False``), or
+    - not directly convertible to int via ``int()``.
+
+    The value 1 is the minimum positive integer accepted as a tensor dimension.
+    Both ``is_fake`` fakes (from TRT's shape-inference placeholder pass) and
+    true dynamic ``ShapeExpr`` dims (from dynamic-shape engines) are bound so
+    that ``_shape_expr_to_ints`` can produce a concrete fallback shape for
+    ``meta_impl`` even in dynamic-shape contexts.

     Mutates *bindings* in place.
     """
@@ -476,10 +490,21 @@ def _collect_shape_var_bindings(shape_expr: Any, bindings: Dict[int, int]) -> None:
             _collect_shape_var_bindings(d.shape_expr, bindings)
         elif isinstance(d, (list, tuple)):
             _collect_shape_var_bindings(d, bindings)
-        elif not isinstance(d, int) and getattr(d, "is_fake", False):
-            vid = id(d)
-            if vid not in bindings:
-                bindings[vid] = _MIN_VALID_DIM
+        elif not isinstance(d, int):
+            is_symbolic = (
+                getattr(d, "is_fake", False)
+                or (hasattr(d, "is_constant") and not d.is_constant)
+            )
+            if not is_symbolic:
+                # Last resort: try converting to int; if it fails, treat as symbolic.
+                try:
+                    int(d)
+                except (TypeError, ValueError):
+                    is_symbolic = True
+            if is_symbolic:
+                vid = id(d)
+                if vid not in bindings:
+                    bindings[vid] = _MIN_VALID_DIM


 def _shape_elem_to_int(d: Any, bindings: Dict[int, int]) -> int:
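Downstream, these bindings presumably feed _shape_expr_to_ints (named in the docstring above) so that meta_impl has a concrete shape to run against during dynamic-shape engine builds. A usage sketch, with the _shape_expr_to_ints call signature assumed rather than taken from the diff:

from typing import Any, Dict, List


def _fallback_shape(inp: Any) -> List[int]:
    """Sketch: concrete stand-in shape for meta_impl while building a
    dynamic-shape engine. `inp.shape_expr` might be [ShapeExpr(batch), 512]
    for the new dynamic-batch test, yielding [1, 512] here."""
    bindings: Dict[int, int] = {}
    _collect_shape_var_bindings(inp.shape_expr, bindings)  # every symbolic dim -> 1
    return _shape_expr_to_ints(inp.shape_expr, bindings)   # signature assumed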
