teacache for previously disabled pipelines; update examples

o-stoner · o-stoner · commit 37625ce741e8 · 2026-04-17T23:35:43.000Z
Signed-off-by: Olivia Stoner <245287810+o-stoner@users.noreply.github.com>
diff --git a/docs/source/models/visual-generation.md b/docs/source/models/visual-generation.md
@@ -41,11 +41,15 @@ Models are auto-detected from the checkpoint directory. Diffusers-format models
 | **FLUX.1** | Yes | Yes | Yes | No [^1] | Yes | No | Yes | Yes | Yes |
 | **FLUX.2** | Yes | Yes | Yes | No [^1] | Yes | No | Yes | Yes | Yes |
 | **Wan 2.1** | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
-| **Wan 2.2** | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes |
-| **LTX-2** | Yes | Yes | No | Yes | Yes | No | No | Yes | Yes |
+| **Wan 2.2** | Yes | Yes | Yes [^2] | Yes | Yes | Yes | Yes | Yes | Yes |
+| **LTX-2** | Yes | Yes | Yes [^3] | Yes | Yes | No | No | Yes | Yes |
 
 [^1]: FLUX models use embedded guidance and do not have a separate negative prompt path, so CFG parallelism is not applicable.
 
+[^2]: Wan 2.2 uses two transformers, one per denoising stage; TeaCache requires explicit `teacache.coefficients` (high-noise stage) and `teacache.coefficients_2` (low-noise stage). There is no built-in coefficient table for Wan 2.2.
+
+[^3]: LTX-2 has no built-in TeaCache coefficient table in TRT-LLM; set `teacache.coefficients` explicitly when enabling TeaCache.
+
 ## Quick Start
 
 Here is a simple example to generate a video with Wan 2.1:
@@ -109,7 +113,7 @@ args = VisualGenArgs(
 
 ### TeaCache
 
-TeaCache caches transformer outputs when timestep embeddings change slowly between denoising steps, skipping redundant computation. Enable with `teacache.enable_teacache: true` (YAML config). The `teacache_thresh` parameter controls the similarity threshold.
+TeaCache caches transformer outputs when timestep embeddings change slowly between denoising steps, skipping redundant computation. Enable with `teacache.enable_teacache: true` (YAML config). The `teacache_thresh` parameter controls the similarity threshold. For Wan 2.2, set both `coefficients` and `coefficients_2` (YAML or CLI). For LTX-2, set `coefficients` when enabling TeaCache (no built-in table). Other models (e.g. FLUX.1, FLUX.2, Wan 2.1) can omit `coefficients` to use the built-in coefficient table, which is selected by checkpoint path matching.
 
 ### Multi-GPU Parallelism
 
diff --git a/examples/visual_gen/README.md b/examples/visual_gen/README.md
@@ -217,6 +217,8 @@ python visual_gen_ltx2.py \
 | `--image_cond_strength` | — | ✓ | 1.0 | Image conditioning strength |
 | `--enable_teacache` | ✓ | ✓ | — | False | Cache optimization |
 | `--teacache_thresh` | ✓ | ✓ | — | 0.2 | TeaCache similarity threshold |
+| `--teacache_coefficients` | ✓ | ✓ | — | *(omit)* | Optional polynomial coeffs; overrides built-in table |
+| `--use_ret_steps` | ✓ | ✓ | — | False | TeaCache retention-steps mode (WAN/FLUX tables) |
 | `--attention_backend` | ✓ | ✓ | — | VANILLA | `VANILLA`, `TRTLLM`, or `FA4` |
 | `--cfg_size` | — | ✓ | — | 1 | CFG parallelism |
 | `--ulysses_size` | ✓ | ✓ | — | 1 | Sequence parallelism |
diff --git a/examples/visual_gen/serve/README.md b/examples/visual_gen/serve/README.md
@@ -53,6 +53,8 @@ Before running these examples, ensure you have:
    ```
    For LTX-2, you need to provide a proper text_encoder_path in `./configs/ltx2.yml`.
 
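+   **TeaCache:** Example YAML files set `enable_teacache` and `teacache_thresh` only. Omit `coefficients` to use each pipeline’s **built-in** coefficient table (checkpoint path matching). Add `coefficients: [ ... ]` under `teacache` only when you need to override those defaults. Exception: Wan 2.2 and LTX-2 have no built-in table, so enabling TeaCache for them requires explicit `coefficients` (plus `coefficients_2` for Wan 2.2); otherwise pipeline loading raises a `ValueError`.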
+
 ## Examples
 
 Current supported & tested models:
diff --git a/examples/visual_gen/visual_gen_flux.py b/examples/visual_gen/visual_gen_flux.py
@@ -123,6 +123,17 @@ def parse_args():
         help="Use ret_steps mode for TeaCache. "
         "Using Retention Steps will result in faster generation speed and better generation quality.",
     )
+    parser.add_argument(
+        "--teacache_coefficients",
+        nargs="+",
+        type=float,
+        default=None,
+        metavar="FLOAT",
+        help=(
+            "Optional TeaCache polynomial coefficients (overrides checkpoint table). "
+            "Example: --teacache_coefficients 1.0 0.0 0.5"
+        ),
+    )
 
     # Quantization
     parser.add_argument(
@@ -222,6 +233,11 @@ def build_diffusion_args(args) -> VisualGenArgs:
                 else {}
             ),
             "use_ret_steps": args.use_ret_steps,
+            **(
+                {"coefficients": list(args.teacache_coefficients)}
+                if args.teacache_coefficients is not None
+                else {}
+            ),
         },
         parallel={
             "dit_ulysses_size": args.ulysses_size,
diff --git a/examples/visual_gen/visual_gen_wan_i2v.py b/examples/visual_gen/visual_gen_wan_i2v.py
@@ -97,6 +97,29 @@ def parse_args():
         help="Use ret_steps mode for TeaCache. "
         "Using Retention Steps will result in faster generation speed and better generation quality.",
     )
+    parser.add_argument(
+        "--teacache_coefficients",
+        nargs="+",
+        type=float,
+        default=None,
+        metavar="FLOAT",
+        help=(
+            "Optional TeaCache polynomial coefficients (overrides checkpoint table). "
+            "Example: --teacache_coefficients 1.0 0.0 0.5"
+        ),
+    )
+    parser.add_argument(
+        "--teacache_coefficients_2",
+        nargs="+",
+        type=float,
+        default=None,
+        metavar="FLOAT",
+        help=(
+            "Second polynomial for Wan 2.2 low-noise transformer_2 (requires "
+            "--teacache_coefficients for the high-noise transformer). "
+            "Ignored for Wan 2.1."
+        ),
+    )
 
     # Quantization
     parser.add_argument(
@@ -182,6 +205,16 @@ def main():
             "enable_teacache": args.enable_teacache,
             "teacache_thresh": args.teacache_thresh,
             "use_ret_steps": args.use_ret_steps,
+            **(
+                {"coefficients": list(args.teacache_coefficients)}
+                if args.teacache_coefficients is not None
+                else {}
+            ),
+            **(
+                {"coefficients_2": list(args.teacache_coefficients_2)}
+                if args.teacache_coefficients_2 is not None
+                else {}
+            ),
         },
         parallel={
             "dit_cfg_size": args.cfg_size,
diff --git a/examples/visual_gen/visual_gen_wan_t2v.py b/examples/visual_gen/visual_gen_wan_t2v.py
@@ -91,6 +91,29 @@ def parse_args():
         help="Use ret_steps mode for TeaCache. "
         "Using Retention Steps will result in faster generation speed and better generation quality.",
     )
+    parser.add_argument(
+        "--teacache_coefficients",
+        nargs="+",
+        type=float,
+        default=None,
+        metavar="FLOAT",
+        help=(
+            "Optional TeaCache polynomial coefficients (overrides checkpoint table). "
+            "Example: --teacache_coefficients 1.0 0.0 0.5"
+        ),
+    )
+    parser.add_argument(
+        "--teacache_coefficients_2",
+        nargs="+",
+        type=float,
+        default=None,
+        metavar="FLOAT",
+        help=(
+            "Second polynomial for Wan 2.2 low-noise transformer_2 (requires "
+            "--teacache_coefficients for the high-noise transformer). "
+            "Ignored for Wan 2.1."
+        ),
+    )
 
     # Quantization
     parser.add_argument(
@@ -191,6 +214,16 @@ def main():
             "enable_teacache": args.enable_teacache,
             "teacache_thresh": args.teacache_thresh,
             "use_ret_steps": args.use_ret_steps,
+            **(
+                {"coefficients": list(args.teacache_coefficients)}
+                if args.teacache_coefficients is not None
+                else {}
+            ),
+            **(
+                {"coefficients_2": list(args.teacache_coefficients_2)}
+                if args.teacache_coefficients_2 is not None
+                else {}
+            ),
         },
         parallel={
             "dit_cfg_size": args.cfg_size,
diff --git a/tensorrt_llm/_torch/visual_gen/config.py b/tensorrt_llm/_torch/visual_gen/config.py
@@ -163,6 +163,8 @@ class TeaCacheConfig(StrictBaseModel):
                      Applied as: rescaled_distance = poly(raw_distance).
                      None means use the pipeline built-in coefficient table (checkpoint path
                      matching). A non-None list overrides that table.
+        coefficients_2: Second polynomial (Wan 2.2 dual-transformer low-noise stage only).
+                     Required together with coefficients when enabling TeaCache on Wan 2.2.
         ret_steps: Number of warmup steps (always compute, initialized at runtime)
         cutoff_steps: Step to stop caching (always compute after, initialized at runtime)
         num_steps: Total inference steps (set at runtime)
@@ -174,6 +176,7 @@ class TeaCacheConfig(StrictBaseModel):
     use_ret_steps: bool = False
 
     coefficients: Optional[List[float]] = None
+    coefficients_2: Optional[List[float]] = None
 
     # Runtime state fields (initialized by TeaCacheBackend.refresh)
     ret_steps: Optional[int] = None
@@ -191,6 +194,8 @@ def validate_teacache(self) -> "TeaCacheConfig":
         # Validate coefficients (when provided)
         if self.coefficients is not None and len(self.coefficients) == 0:
             raise ValueError("TeaCache coefficients list cannot be empty")
+        if self.coefficients_2 is not None and len(self.coefficients_2) == 0:
+            raise ValueError("TeaCache coefficients_2 list cannot be empty")
 
         # Validate ret_steps if set
         if self.ret_steps is not None and self.ret_steps < 0:
diff --git a/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py b/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py
@@ -290,7 +290,7 @@ def load_weights(self, weights: dict) -> None:
     def post_load_weights(self) -> None:
         """Post-load setup: TeaCache registration."""
         super().post_load_weights()
-        if self.transformer is not None:
+        if self.transformer is not None and self.model_config.teacache.enable_teacache:
             # Register TeaCache extractor for FLUX.2 (must be after device placement)
             # Only set guidance_param_name for variants with guidance_embeds
             guidance_param = "guidance" if self.transformer.guidance_embeds else None
@@ -313,7 +313,6 @@ def post_load_weights(self) -> None:
                 )
             )
 
-            # Enable TeaCache with FLUX.2-specific polynomial coefficients
             self._setup_teacache(self.transformer, FLUX2_TEACACHE_COEFFICIENTS)
 
     def infer(self, req):
diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py
@@ -17,7 +17,7 @@
 from tensorrt_llm._torch.visual_gen.output import MediaOutput
 from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline
 from tensorrt_llm._torch.visual_gen.pipeline_registry import register_pipeline
-from tensorrt_llm._torch.visual_gen.teacache import CacheContext
+from tensorrt_llm._torch.visual_gen.teacache import CacheContext, register_extractor
 from tensorrt_llm._torch.visual_gen.utils import postprocess_video_tensor
 from tensorrt_llm.logger import logger
 
@@ -592,13 +592,18 @@ def post_load_weights(self) -> None:
         """Finalize after weight loading: TeaCache, derived attributes."""
         super().post_load_weights()
 
-        # TODO: TeaCache disabled: LTX2_TEACACHE_COEFFICIENTS are unverified.
-        # To re-enable, uncomment the following lines and verify coefficients.
-        # register_extractor(
-        #     "LTXModel",
-        #     LTX2TeaCacheExtractor(self._compute_ltx2_timestep_embedding),
-        # )
-        # self._setup_teacache(self.transformer, coefficients=LTX2_TEACACHE_COEFFICIENTS)
+        # LTX-2: single transformer (one DiT for video+audio); TeaCache only with explicit coefficients.
+        if self.transformer is not None and self.model_config.teacache.enable_teacache:
+            if self.model_config.teacache.coefficients is None:
+                raise ValueError(
+                    "TeaCache on LTX-2 requires explicit teacache.coefficients "
+                    "(no built-in coefficient table)."
+                )
+            register_extractor(
+                "LTXModel",
+                LTX2TeaCacheExtractor(self._compute_ltx2_timestep_embedding),
+            )
+            self._setup_teacache(self.transformer, coefficients=None)
 
         # Compression ratios from native scale factors
         self.vae_spatial_compression_ratio = VIDEO_SCALE_FACTORS.width
diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py
@@ -12,7 +12,11 @@
 from tensorrt_llm._torch.visual_gen.output import MediaOutput
 from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline
 from tensorrt_llm._torch.visual_gen.pipeline_registry import register_pipeline
-from tensorrt_llm._torch.visual_gen.teacache import ExtractorConfig, register_extractor_from_config
+from tensorrt_llm._torch.visual_gen.teacache import (
+    ExtractorConfig,
+    TeaCacheBackend,
+    register_extractor_from_config,
+)
 from tensorrt_llm._torch.visual_gen.utils import postprocess_video_tensor
 from tensorrt_llm._utils import nvtx_range
 from tensorrt_llm.logger import logger
@@ -77,13 +81,6 @@ def __init__(self, model_config):
         self.boundary_ratio = getattr(model_config.pretrained_config, "boundary_ratio", None)
         self.is_wan22 = self.boundary_ratio is not None
 
-        # Validate TeaCache compatibility before allocating GPU memory
-        if self.is_wan22 and model_config.teacache.enable_teacache:
-            raise ValueError(
-                "TeaCache is not supported for Wan 2.2 T2V models. "
-                "Set enable_teacache=False in TeaCacheConfig."
-            )
-
         super().__init__(model_config)
 
     def _compute_wan_timestep_embedding(self, module, timestep=None, **kwargs):
@@ -277,16 +274,41 @@ def post_load_weights(self) -> None:
             if not self.is_wan22:
                 self._setup_teacache(self.transformer, coefficients=WAN_TEACACHE_COEFFICIENTS)
                 self.transformer_cache_backend = self.cache_backend
-            else:
-                # TeaCache is not supported for Wan 2.2: the dual-transformer
-                # architecture (transformer + transformer_2) requires separate
-                # TeaCache coefficients that have not been calibrated yet.
-                self.transformer_cache_backend = None
 
         if self.transformer_2 is not None:
             if hasattr(self.transformer_2, "post_load_weights"):
                 self.transformer_2.post_load_weights()
 
+        # Wan 2.2 TeaCache after both transformers' post_load_weights (FP8 scales, etc.)
+        if (
+            self.transformer is not None
+            and self.transformer_2 is not None
+            and self.is_wan22
+            and self.model_config.teacache.enable_teacache
+        ):
+            self._apply_teacache_coefficients(WAN_TEACACHE_COEFFICIENTS)
+            tc = self.model_config.teacache
+            if tc.coefficients is None or tc.coefficients_2 is None:
+                raise ValueError(
+                    "Wan 2.2 TeaCache requires explicit teacache.coefficients and "
+                    "teacache.coefficients_2 (high-noise and low-noise stage polynomials). "
+                    "There is no built-in coefficient table for Wan 2.2."
+                )
+            cfg_high = tc.model_copy(deep=True)
+            cfg_low = tc.model_copy(deep=True)
+            cfg_low.coefficients = tc.coefficients_2
+            logger.info("TeaCache: Initializing (Wan 2.2 high-noise transformer)...")
+            self.cache_backend = TeaCacheBackend(cfg_high)
+            self.cache_backend.enable(self.transformer)
+            self.transformer_cache_backend = self.cache_backend
+            logger.info("TeaCache: Initializing (Wan 2.2 low-noise transformer_2)...")
+            self.transformer_2_cache_backend = TeaCacheBackend(cfg_low)
+            self.transformer_2_cache_backend.enable(self.transformer_2)
+            self._teacache_backends = [
+                self.cache_backend,
+                self.transformer_2_cache_backend,
+            ]
+
     def _run_warmup(self, height: int, width: int, num_frames: int, steps: int) -> None:
         with torch.no_grad():
             self.forward(
diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py
@@ -15,7 +15,11 @@
 from tensorrt_llm._torch.visual_gen.output import MediaOutput
 from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline
 from tensorrt_llm._torch.visual_gen.pipeline_registry import register_pipeline
-from tensorrt_llm._torch.visual_gen.teacache import ExtractorConfig, register_extractor_from_config
+from tensorrt_llm._torch.visual_gen.teacache import (
+    ExtractorConfig,
+    TeaCacheBackend,
+    register_extractor_from_config,
+)
 from tensorrt_llm._torch.visual_gen.utils import postprocess_video_tensor
 from tensorrt_llm.logger import logger
 
@@ -98,13 +102,6 @@ def __init__(self, model_config):
         self.boundary_ratio = getattr(model_config.pretrained_config, "boundary_ratio", None)
         self.is_wan22 = self.boundary_ratio is not None
 
-        # Validate TeaCache compatibility before allocating GPU memory
-        if self.is_wan22 and model_config.teacache.enable_teacache:
-            raise ValueError(
-                "TeaCache is not supported for Wan 2.2 models. "
-                "Set enable_teacache=False in TeaCacheConfig."
-            )
-
         super().__init__(model_config)
 
     def _compute_wan_timestep_embedding(self, module, timestep=None, **kwargs):
@@ -337,16 +334,40 @@ def post_load_weights(self) -> None:
             if not self.is_wan22:
                 self._setup_teacache(self.transformer, coefficients=WAN_I2V_TEACACHE_COEFFICIENTS)
                 self.transformer_cache_backend = self.cache_backend
-            else:
-                # TeaCache is not supported for Wan 2.2: the dual-transformer
-                # architecture (transformer + transformer_2) requires separate
-                # TeaCache coefficients that have not been calibrated yet.
-                self.transformer_cache_backend = None
 
         if self.transformer_2 is not None:
             if hasattr(self.transformer_2, "post_load_weights"):
                 self.transformer_2.post_load_weights()
 
+        if (
+            self.transformer is not None
+            and self.transformer_2 is not None
+            and self.is_wan22
+            and self.model_config.teacache.enable_teacache
+        ):
+            self._apply_teacache_coefficients(WAN_I2V_TEACACHE_COEFFICIENTS)
+            tc = self.model_config.teacache
+            if tc.coefficients is None or tc.coefficients_2 is None:
+                raise ValueError(
+                    "Wan 2.2 TeaCache requires explicit teacache.coefficients and "
+                    "teacache.coefficients_2 (high-noise and low-noise stage polynomials). "
+                    "There is no built-in coefficient table for Wan 2.2."
+                )
+            cfg_high = tc.model_copy(deep=True)
+            cfg_low = tc.model_copy(deep=True)
+            cfg_low.coefficients = tc.coefficients_2
+            logger.info("TeaCache: Initializing (Wan 2.2 I2V high-noise transformer)...")
+            self.cache_backend = TeaCacheBackend(cfg_high)
+            self.cache_backend.enable(self.transformer)
+            self.transformer_cache_backend = self.cache_backend
+            logger.info("TeaCache: Initializing (Wan 2.2 I2V low-noise transformer_2)...")
+            self.transformer_2_cache_backend = TeaCacheBackend(cfg_low)
+            self.transformer_2_cache_backend.enable(self.transformer_2)
+            self._teacache_backends = [
+                self.cache_backend,
+                self.transformer_2_cache_backend,
+            ]
+
     def _run_warmup(self, height: int, width: int, num_frames: int, steps: int) -> None:
         dummy_image = PIL.Image.new("RGB", (width, height))
         with torch.no_grad():
diff --git a/tensorrt_llm/_torch/visual_gen/pipeline.py b/tensorrt_llm/_torch/visual_gen/pipeline.py
diff --git a/tests/unittest/_torch/visual_gen/test_teacache.py b/tests/unittest/_torch/visual_gen/test_teacache.py

Original file line number	Diff line number	Diff line change
`@@ -290,7 +290,7 @@ def load_weights(self, weights: dict) -> None:`
`290`	`290`	`def post_load_weights(self) -> None:`
`291`	`291`	`"""Post-load setup: TeaCache registration."""`
`292`	`292`	`super().post_load_weights()`
`293`		`- if self.transformer is not None:`
	`293`	`+ if self.transformer is not None and self.model_config.teacache.enable_teacache:`
`294`	`294`	`# Register TeaCache extractor for FLUX.2 (must be after device placement)`
`295`	`295`	`# Only set guidance_param_name for variants with guidance_embeds`
`296`	`296`	`guidance_param = "guidance" if self.transformer.guidance_embeds else None`
`@@ -313,7 +313,6 @@ def post_load_weights(self) -> None:`
`313`	`313`	`)`
`314`	`314`	`)`
`315`	`315`
`316`		`- # Enable TeaCache with FLUX.2-specific polynomial coefficients`
`317`	`316`	`self._setup_teacache(self.transformer, FLUX2_TEACACHE_COEFFICIENTS)`
`318`	`317`
`319`	`318`	`def infer(self, req):`