Merge branch 'main' into pos_emb_on_npu

zhangtao0408 · web-flow · commit 25d2fd82aaca · 2026-01-06T10:44:26.000+08:00
diff --git a/examples/community/pipeline_z_image_differential_img2img.py b/examples/community/pipeline_z_image_differential_img2img.py
diff --git a/setup.py b/setup.py
@@ -274,7 +274,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.36.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.37.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="State-of-the-art diffusion in PyTorch and JAX.",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py
@@ -162,7 +162,7 @@
         "default_subfolder": "transformer",
     },
     "QwenImageTransformer2DModel": {
-        "checkpoint_mapping_fn": lambda x: x,
+        "checkpoint_mapping_fn": lambda checkpoint, **kwargs: checkpoint,
         "default_subfolder": "transformer",
     },
     "Flux2Transformer2DModel": {
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
@@ -120,7 +120,10 @@
     "hunyuan-video": "txt_in.individual_token_refiner.blocks.0.adaLN_modulation.1.bias",
     "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight",
     "lumina2": ["model.diffusion_model.cap_embedder.0.weight", "cap_embedder.0.weight"],
-    "z-image-turbo": "cap_embedder.0.weight",
+    "z-image-turbo": [
+        "model.diffusion_model.layers.0.adaLN_modulation.0.weight",
+        "layers.0.adaLN_modulation.0.weight",
+    ],
     "z-image-turbo-controlnet": "control_all_x_embedder.2-1.weight",
     "z-image-turbo-controlnet-2.x": "control_layers.14.adaLN_modulation.0.weight",
     "sana": [
@@ -727,10 +730,7 @@ def infer_diffusers_model_type(checkpoint):
     ):
         model_type = "instruct-pix2pix"
 
-    elif (
-        CHECKPOINT_KEY_NAMES["z-image-turbo"] in checkpoint
-        and checkpoint[CHECKPOINT_KEY_NAMES["z-image-turbo"]].shape[0] == 2560
-    ):
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["z-image-turbo"]):
         model_type = "z-image-turbo"
 
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["lumina2"]):
@@ -3852,6 +3852,7 @@ def convert_z_image_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
         ".attention.k_norm.weight": ".attention.norm_k.weight",
         ".attention.q_norm.weight": ".attention.norm_q.weight",
         ".attention.out.weight": ".attention.to_out.0.weight",
+        "model.diffusion_model.": "",
     }
 
     def convert_z_image_fused_attention(key: str, state_dict: dict[str, object]) -> None:
@@ -3886,6 +3887,9 @@ def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str)
 
         update_state_dict(converted_state_dict, key, new_key)
 
+    if "norm_final.weight" in converted_state_dict.keys():
+        _ = converted_state_dict.pop("norm_final.weight")
+
     # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
     # special_keys_remap
     for key in list(converted_state_dict.keys()):
diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
@@ -134,7 +134,8 @@ def apply_rotary_emb(
                 dropout_p=0.0,
                 is_causal=False,
                 backend=self._attention_backend,
-                parallel_config=self._parallel_config,
+                # Reference: https://github.com/huggingface/diffusers/pull/12909
+                parallel_config=None,
             )
             hidden_states_img = hidden_states_img.flatten(2, 3)
             hidden_states_img = hidden_states_img.type_as(query)
@@ -147,7 +148,8 @@ def apply_rotary_emb(
             dropout_p=0.0,
             is_causal=False,
             backend=self._attention_backend,
-            parallel_config=self._parallel_config,
+            # Reference: https://github.com/huggingface/diffusers/pull/12909
+            parallel_config=(self._parallel_config if encoder_hidden_states is None else None),
         )
         hidden_states = hidden_states.flatten(2, 3)
         hidden_states = hidden_states.type_as(query)
@@ -552,9 +554,11 @@ class WanTransformer3DModel(
         "blocks.0": {
             "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
         },
-        "blocks.*": {
-            "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
-        },
+        # Reference: https://github.com/huggingface/diffusers/pull/12909
+        # encoder_hidden_states is intentionally not split across context-parallel (CP) devices:
+        # the image_encoder (Wan 2.1 I2V) always generates 257 tokens for image_embed, so after
+        # concatenation encoder_hidden_states always has 769 (512 + 257) tokens, and that
+        # sequence length cannot be divided evenly by the number of devices in the CP group.
         "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
         "": {
             "timestep": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False),
diff --git a/src/diffusers/models/transformers/transformer_wan_animate.py b/src/diffusers/models/transformers/transformer_wan_animate.py
@@ -609,7 +609,8 @@ def apply_rotary_emb(
                 dropout_p=0.0,
                 is_causal=False,
                 backend=self._attention_backend,
-                parallel_config=self._parallel_config,
+                # Reference: https://github.com/huggingface/diffusers/pull/12909
+                parallel_config=None,
             )
             hidden_states_img = hidden_states_img.flatten(2, 3)
             hidden_states_img = hidden_states_img.type_as(query)
@@ -622,7 +623,8 @@ def apply_rotary_emb(
             dropout_p=0.0,
             is_causal=False,
             backend=self._attention_backend,
-            parallel_config=self._parallel_config,
+            # Reference: https://github.com/huggingface/diffusers/pull/12909
+            parallel_config=(self._parallel_config if encoder_hidden_states is None else None),
         )
         hidden_states = hidden_states.flatten(2, 3)
         hidden_states = hidden_states.type_as(query)
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -76,7 +76,7 @@ def retrieve_latents(
 
         >>> model_id = "nvidia/Cosmos-Predict2.5-2B"
         >>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
-        ...     model_id, revision="diffusers/base/pre-trianed", torch_dtype=torch.bfloat16
+        ...     model_id, revision="diffusers/base/post-trained", torch_dtype=torch.bfloat16
         ... )
         >>> pipe = pipe.to("cuda")
 
diff --git a/src/diffusers/quantizers/torchao/torchao_quantizer.py b/src/diffusers/quantizers/torchao/torchao_quantizer.py
@@ -36,6 +36,9 @@
 from ..base import DiffusersQuantizer
 
 
+logger = logging.get_logger(__name__)
+
+
 if TYPE_CHECKING:
     from ...models.modeling_utils import ModelMixin
 
@@ -83,11 +86,19 @@ def _update_torch_safe_globals():
     ]
     try:
         from torchao.dtypes import NF4Tensor
-        from torchao.dtypes.floatx.float8_layout import Float8AQTTensorImpl
-        from torchao.dtypes.uintx.uint4_layout import UInt4Tensor
         from torchao.dtypes.uintx.uintx_layout import UintxAQTTensorImpl, UintxTensor
 
-        safe_globals.extend([UintxTensor, UInt4Tensor, UintxAQTTensorImpl, Float8AQTTensorImpl, NF4Tensor])
+        safe_globals.extend([UintxTensor, UintxAQTTensorImpl, NF4Tensor])
+
+        # note: is_torchao_version(">=", "0.16.0") does not work correctly
+        # with torchao nightly, so using a ">" check which does work correctly
+        if is_torchao_version(">", "0.15.0"):
+            pass
+        else:
+            from torchao.dtypes.floatx.float8_layout import Float8AQTTensorImpl
+            from torchao.dtypes.uintx.uint4_layout import UInt4Tensor
+
+            safe_globals.extend([UInt4Tensor, Float8AQTTensorImpl])
 
     except (ImportError, ModuleNotFoundError) as e:
         logger.warning(
@@ -123,9 +134,6 @@ def fuzzy_match_size(config_name: str) -> Optional[str]:
     return None
 
 
-logger = logging.get_logger(__name__)
-
-
 def _quantization_type(weight):
     from torchao.dtypes import AffineQuantizedTensor
     from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor