# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass

import torch

from .hooks import BaseState, HookRegistry, ModelHook, StateManager


_TEXT_KV_CACHE_TRANSFORMER_HOOK = "text_kv_cache_transformer"
_TEXT_KV_CACHE_BLOCK_HOOK = "text_kv_cache_block"


@dataclass
class TextKVCacheConfig:
    """Enable exact (lossless) text K/V caching for transformer models.

    Computes the per-block text key and value projections once, on the first denoising step for each unique prompt,
    and reuses them across all subsequent steps. Positive and negative prompts are distinguished via a stable cache
    key captured by a transformer-level hook before any intermediate tensor allocations.
    """

    pass


class TextKVCacheState(BaseState):
    """Shared state between the transformer-level and block-level hooks.

    The transformer hook writes the stable ``encoder_hidden_states`` ``data_ptr()`` (captured *before* ``txt_norm``) so
    that block hooks can use it as a reliable cache key across denoising steps.
    """

    def __init__(self):
        self.key: int | None = None

    def reset(self):
        self.key = None


class TextKVCacheBlockState(BaseState):
    """Per-block state holding cached text key/value projections."""

    def __init__(self):
        self.kv_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {}

    def reset(self):
        self.kv_cache.clear()


class TextKVCacheTransformerHook(ModelHook):
    """Captures ``encoder_hidden_states.data_ptr()`` before ``txt_norm``
    and writes it to shared state for the block hooks to read."""

    _is_stateful = True

    def __init__(self, state_manager: StateManager):
        super().__init__()
        self.state_manager = state_manager

    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
        if self.state_manager._current_context is None:
            self.state_manager.set_context("inference")

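        # Capture a stable identifier for the current prompt embedding. Positive and negative prompt tensors are
        # separate allocations, so their ``data_ptr()`` values keep their caches apart across denoising steps.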
        encoder_hidden_states = kwargs.get("encoder_hidden_states")
        if encoder_hidden_states is not None:
            state: TextKVCacheState = self.state_manager.get_state()
            state.key = encoder_hidden_states.data_ptr()
        return self.fn_ref.original_forward(*args, **kwargs)

    def reset_state(self, module: torch.nn.Module):
        self.state_manager.reset()
        return module


class TextKVCacheBlockHook(ModelHook):
    """Caches ``(txt_key, txt_value)`` per block per unique prompt using
    the stable cache key from the shared state."""

    _is_stateful = True

    def __init__(self, state_manager: StateManager, block_state_manager: StateManager):
        super().__init__()
        self.state_manager = state_manager
        self.block_state_manager = block_state_manager

    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
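        # Imported lazily here, presumably to avoid a circular import between the hooks and the model definition.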
        from ..models.transformers.transformer_nucleusmoe_image import _apply_rotary_emb_nucleus

        if self.state_manager._current_context is None:
            self.state_manager.set_context("inference")

        if self.block_state_manager._current_context is None:
            self.block_state_manager.set_context("inference")

        if "encoder_hidden_states" in kwargs:
            encoder_hidden_states = kwargs["encoder_hidden_states"]
        else:
            encoder_hidden_states = args[1]

        if "image_rotary_emb" in kwargs:
            image_rotary_emb = kwargs["image_rotary_emb"]
        elif len(args) > 3:
            image_rotary_emb = args[3]
        else:
            image_rotary_emb = None

        state: TextKVCacheState = self.state_manager.get_state()
        cache_key = state.key

        block_state: TextKVCacheBlockState = self.block_state_manager.get_state()

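        # Text key/value projections do not depend on the timestep, so they are computed once per unique prompt and
        # reused on every subsequent denoising step.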
        if cache_key not in block_state.kv_cache:
            context = module.encoder_proj(encoder_hidden_states)

            attn = module.attn
            head_dim = attn.inner_dim // attn.heads
            num_kv_heads = attn.inner_kv_dim // head_dim

            txt_key = attn.add_k_proj(context).unflatten(-1, (num_kv_heads, -1))
            txt_value = attn.add_v_proj(context).unflatten(-1, (num_kv_heads, -1))

            if attn.norm_added_k is not None:
                txt_key = attn.norm_added_k(txt_key)

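            # The text-side rotary embedding depends only on token positions, never on the timestep, so the rotated
            # key can be cached as well.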
            if image_rotary_emb is not None:
                _, txt_freqs = image_rotary_emb
                txt_key = _apply_rotary_emb_nucleus(txt_key, txt_freqs, use_real=False)

            block_state.kv_cache[cache_key] = (txt_key, txt_value)

        txt_key, txt_value = block_state.kv_cache[cache_key]

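        # Hand the cached projections to the attention processor through ``attention_kwargs`` so it can skip the
        # text K/V computation for this block.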
        attn_kwargs = kwargs.get("attention_kwargs") or {}
        attn_kwargs["cached_txt_key"] = txt_key
        attn_kwargs["cached_txt_value"] = txt_value
        kwargs["attention_kwargs"] = attn_kwargs

        return self.fn_ref.original_forward(*args, **kwargs)

    def reset_state(self, module: torch.nn.Module):
        self.block_state_manager.reset()
        return module


def apply_text_kv_cache(module: torch.nn.Module, config: TextKVCacheConfig) -> None:
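    """Registers text K/V caching hooks on ``module``.

    A transformer-level hook records a stable per-prompt cache key, and a block-level hook on every
    ``NucleusMoEImageTransformerBlock`` caches that block's text key/value projections so they are computed only
    once per prompt. ``config`` currently carries no options.
    """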
    from ..models.transformers.transformer_nucleusmoe_image import NucleusMoEImageTransformerBlock

    state_manager = StateManager(TextKVCacheState)

    transformer_hook = TextKVCacheTransformerHook(state_manager)
    registry = HookRegistry.check_if_exists_or_initialize(module)
    registry.register_hook(transformer_hook, _TEXT_KV_CACHE_TRANSFORMER_HOOK)

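    # Each block keeps its own K/V cache, but all block hooks share the transformer-level state manager so that they
    # read the same per-prompt cache key.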
    for _, submodule in module.named_modules():
        if isinstance(submodule, NucleusMoEImageTransformerBlock):
            block_state_manager = StateManager(TextKVCacheBlockState)
            hook = TextKVCacheBlockHook(state_manager, block_state_manager)
            block_registry = HookRegistry.check_if_exists_or_initialize(submodule)
            block_registry.register_hook(hook, _TEXT_KV_CACHE_BLOCK_HOOK)
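

# Minimal usage sketch (illustrative only; ``transformer`` stands for any model whose blocks are
# ``NucleusMoEImageTransformerBlock`` instances and whose forward receives ``encoder_hidden_states`` as a keyword
# argument):
#
#     transformer = ...  # e.g. the denoiser of a NucleusMoE image pipeline
#     apply_text_kv_cache(transformer, TextKVCacheConfig())
#     # First denoising step: each block computes and caches its text K/V projections per prompt embedding.
#     # Later steps: the cached tensors are forwarded through ``attention_kwargs`` and reused unchanged.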