huggingface
diff --git a/‎docs/source/en/_toctree.yml‎
Lines changed: 5 additions & 1 deletion b/‎docs/source/en/_toctree.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/source/en/api/models/longcat_image_transformer2d.md‎
Lines changed: 25 additions & 0 deletions b/‎docs/source/en/api/models/longcat_image_transformer2d.md‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎docs/source/en/api/pipelines/longcat_image.md‎
Lines changed: 114 additions & 0 deletions b/‎docs/source/en/api/pipelines/longcat_image.md‎
Lines changed: 114 additions & 0 deletions
diff --git a/‎src/diffusers/__init__.py‎
Lines changed: 14 additions & 0 deletions b/‎src/diffusers/__init__.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎src/diffusers/loaders/single_file_model.py‎
Lines changed: 9 additions & 1 deletion b/‎src/diffusers/loaders/single_file_model.py‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎src/diffusers/loaders/single_file_utils.py‎
Lines changed: 24 additions & 0 deletions b/‎src/diffusers/loaders/single_file_utils.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎src/diffusers/models/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎src/diffusers/models/__init__.py‎
Lines changed: 4 additions & 0 deletions
@@ -365,6 +365,8 @@
         title: HunyuanVideoTransformer3DModel
       - local: api/models/latte_transformer3d
         title: LatteTransformer3DModel
+      - local: api/models/longcat_image_transformer2d
+        title: LongCatImageTransformer2DModel
       - local: api/models/ltx_video_transformer3d
         title: LTXVideoTransformer3DModel
       - local: api/models/lumina2_transformer2d
@@ -402,7 +404,7 @@
       - local: api/models/wan_transformer_3d
         title: WanTransformer3DModel
       - local: api/models/z_image_transformer2d
-        title: ZImageTransformer2DModel        
+        title: ZImageTransformer2DModel
       title: Transformers
     - sections:
       - local: api/models/stable_cascade_unet
@@ -563,6 +565,8 @@
         title: Latent Diffusion
       - local: api/pipelines/ledits_pp
         title: LEDITS++
+      - local: api/pipelines/longcat_image
+        title: LongCat-Image
       - local: api/pipelines/lumina2
         title: Lumina 2.0
       - local: api/pipelines/lumina
 
@@ -0,0 +1,25 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# LongCatImageTransformer2DModel
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import LongCatImageTransformer2DModel
+
+transformer = LongCatImageTransformer2DModel.from_pretrained("meituan-longcat/LongCat-Image ", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## LongCatImageTransformer2DModel
+
+[[autodoc]] LongCatImageTransformer2DModel
@@ -0,0 +1,114 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# LongCat-Image
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+
+
+We introduce LongCat-Image, a pioneering open-source and bilingual (Chinese-English) foundation model for image generation, designed to address core challenges in multilingual text rendering, photorealism, deployment efficiency, and developer accessibility prevalent in current leading models.
+
+
+### Key Features
+- 🌟 **Exceptional Efficiency and Performance**: With only **6B parameters**, LongCat-Image surpasses numerous open-source models that are several times larger across multiple benchmarks, demonstrating the immense potential of efficient model design.
+- 🌟 **Superior Editing Performance**: LongCat-Image-Edit model achieves state-of-the-art performance among open-source models, delivering leading instruction-following and image quality with superior visual consistency.
+- 🌟 **Powerful Chinese Text Rendering**: LongCat-Image demonstrates superior accuracy and stability in rendering common Chinese characters compared to existing SOTA open-source models and achieves industry-leading coverage of the Chinese dictionary.
+- 🌟 **Remarkable Photorealism**: Through an innovative data strategy and training framework, LongCat-Image achieves remarkable photorealism in generated images.
+- 🌟 **Comprehensive Open-Source Ecosystem**: We provide a complete toolchain, from intermediate checkpoints to full training code, significantly lowering the barrier for further research and development.
+
+For more details, please refer to the comprehensive [***LongCat-Image Technical Report***](https://arxiv.org/abs/2412.11963)
+
+
+## Usage Example
+
+```py
+import torch
+import diffusers
+from diffusers import LongCatImagePipeline
+
+weight_dtype = torch.bfloat16
+pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=torch.bfloat16 )
+pipe.to('cuda')
+# pipe.enable_model_cpu_offload()
+
+prompt = '一个年轻的亚裔女性，身穿黄色针织衫，搭配白色项链。她的双手放在膝盖上，表情恬静。背景是一堵粗糙的砖墙，午后的阳光温暖地洒在她身上，营造出一种宁静而温馨的氛围。镜头采用中距离视角，突出她的神态和服饰的细节。光线柔和地打在她的脸上，强调她的五官和饰品的质感，增加画面的层次感与亲和力。整个画面构图简洁，砖墙的纹理与阳光的光影效果相得益彰，突显出人物的优雅与从容。'
+image = pipe(
+    prompt,
+    height=768,
+    width=1344,
+    guidance_scale=4.0,
+    num_inference_steps=50,
+    num_images_per_prompt=1,
+    generator=torch.Generator("cpu").manual_seed(43),
+    enable_cfg_renorm=True,
+    enable_prompt_rewrite=True,
+).images[0]
+image.save(f'./longcat_image_t2i_example.png')
+```
+
+
+This pipeline was contributed by LongCat-Image Team. The original codebase can be found [here](https://github.com/meituan-longcat/LongCat-Image).
+
+Available models:
+<div style="overflow-x: auto; margin-bottom: 16px;">
+  <table style="border-collapse: collapse; width: 100%;">
+    <thead>
+      <tr>
+        <th style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Models</th>
+        <th style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Type</th>
+        <th style="padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Description</th>
+        <th style="padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Download Link</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image</td>
+        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Text&#8209;to&#8209;Image</td>
+        <td style="padding: 8px; border: 1px solid #d0d7de;">Final Release. The standard model for out&#8209;of&#8209;the&#8209;box inference.</td>
+        <td style="padding: 8px; border: 1px solid #d0d7de;">
+          <span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image">Huggingface</a></span>
+        </td>
+      </tr>
+      <tr>
+        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image&#8209;Dev</td>
+        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Text&#8209;to&#8209;Image</td>
+        <td style="padding: 8px; border: 1px solid #d0d7de;">Development. Mid-training checkpoint, suitable for fine-tuning.</td>
+        <td style="padding: 8px; border: 1px solid #d0d7de;">
+          <span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image-Dev">Huggingface</a></span>
+        </td>
+      </tr>
+      <tr>
+        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image&#8209;Edit</td>
+        <td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Image Editing</td>
+        <td style="padding: 8px; border: 1px solid #d0d7de;">Specialized model for image editing.</td>
+        <td style="padding: 8px; border: 1px solid #d0d7de;">
+          <span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image-Edit">Huggingface</a></span>
+        </td>
+      </tr>
+    </tbody>
+  </table>
+</div>
+
+## LongCatImagePipeline
+
+[[autodoc]] LongCatImagePipeline
+- all
+- __call__
+
+## LongCatImagePipelineOutput
+
+[[autodoc]] pipelines.longcat_image.pipeline_output.LongCatImagePipelineOutput
+
+
+
@@ -235,6 +235,7 @@
             "Kandinsky3UNet",
             "Kandinsky5Transformer3DModel",
             "LatteTransformer3DModel",
+            "LongCatImageTransformer2DModel",
             "LTXVideoTransformer3DModel",
             "Lumina2Transformer2DModel",
             "LuminaNextDiT2DModel",
@@ -278,6 +279,7 @@
             "WanAnimateTransformer3DModel",
             "WanTransformer3DModel",
             "WanVACETransformer3DModel",
+            "ZImageControlNetModel",
             "ZImageTransformer2DModel",
             "attention_backend",
         ]
@@ -532,6 +534,8 @@
             "LDMTextToImagePipeline",
             "LEditsPPPipelineStableDiffusion",
             "LEditsPPPipelineStableDiffusionXL",
+            "LongCatImageEditPipeline",
+            "LongCatImagePipeline",
             "LTXConditionPipeline",
             "LTXImageToVideoPipeline",
             "LTXLatentUpsamplePipeline",
@@ -561,6 +565,7 @@
             "QwenImageEditPlusPipeline",
             "QwenImageImg2ImgPipeline",
             "QwenImageInpaintPipeline",
+            "QwenImageLayeredPipeline",
             "QwenImagePipeline",
             "ReduxImageEncoder",
             "SanaControlNetPipeline",
@@ -666,6 +671,8 @@
             "WuerstchenCombinedPipeline",
             "WuerstchenDecoderPipeline",
             "WuerstchenPriorPipeline",
+            "ZImageControlNetInpaintPipeline",
+            "ZImageControlNetPipeline",
             "ZImageImg2ImgPipeline",
             "ZImagePipeline",
         ]
@@ -970,6 +977,7 @@
             Kandinsky3UNet,
             Kandinsky5Transformer3DModel,
             LatteTransformer3DModel,
+            LongCatImageTransformer2DModel,
             LTXVideoTransformer3DModel,
             Lumina2Transformer2DModel,
             LuminaNextDiT2DModel,
@@ -1012,6 +1020,7 @@
             WanAnimateTransformer3DModel,
             WanTransformer3DModel,
             WanVACETransformer3DModel,
+            ZImageControlNetModel,
             ZImageTransformer2DModel,
             attention_backend,
         )
@@ -1237,6 +1246,8 @@
             LDMTextToImagePipeline,
             LEditsPPPipelineStableDiffusion,
             LEditsPPPipelineStableDiffusionXL,
+            LongCatImageEditPipeline,
+            LongCatImagePipeline,
             LTXConditionPipeline,
             LTXImageToVideoPipeline,
             LTXLatentUpsamplePipeline,
@@ -1266,6 +1277,7 @@
             QwenImageEditPlusPipeline,
             QwenImageImg2ImgPipeline,
             QwenImageInpaintPipeline,
+            QwenImageLayeredPipeline,
             QwenImagePipeline,
             ReduxImageEncoder,
             SanaControlNetPipeline,
@@ -1369,6 +1381,8 @@
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
+            ZImageControlNetInpaintPipeline,
+            ZImageControlNetPipeline,
             ZImageImg2ImgPipeline,
             ZImagePipeline,
         )
 
@@ -49,6 +49,7 @@
     convert_stable_cascade_unet_single_file_to_diffusers,
     convert_wan_transformer_to_diffusers,
     convert_wan_vae_to_diffusers,
+    convert_z_image_controlnet_checkpoint_to_diffusers,
     convert_z_image_transformer_checkpoint_to_diffusers,
     create_controlnet_diffusers_config_from_ldm,
     create_unet_diffusers_config_from_ldm,
@@ -172,11 +173,18 @@
         "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
         "default_subfolder": "transformer",
     },
+    "ZImageControlNetModel": {
+        "checkpoint_mapping_fn": convert_z_image_controlnet_checkpoint_to_diffusers,
+    },
 }
 
 
 def _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint_state_dict):
-    return not set(model_state_dict.keys()).issubset(set(checkpoint_state_dict.keys()))
+    model_state_dict_keys = set(model_state_dict.keys())
+    checkpoint_state_dict_keys = set(checkpoint_state_dict.keys())
+    is_subset = model_state_dict_keys.issubset(checkpoint_state_dict_keys)
+    is_match = model_state_dict_keys == checkpoint_state_dict_keys
+    return not (is_subset and is_match)
 
 
 def _get_single_file_loadable_mapping_class(cls):
 
@@ -121,6 +121,8 @@
     "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight",
     "lumina2": ["model.diffusion_model.cap_embedder.0.weight", "cap_embedder.0.weight"],
     "z-image-turbo": "cap_embedder.0.weight",
+    "z-image-turbo-controlnet": "control_all_x_embedder.2-1.weight",
+    "z-image-turbo-controlnet-2.x": "control_layers.14.adaLN_modulation.0.weight",
     "sana": [
         "blocks.0.cross_attn.q_linear.weight",
         "blocks.0.cross_attn.q_linear.bias",
@@ -220,6 +222,8 @@
     "cosmos-2.0-v2w-2B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-2B-Video2World"},
     "cosmos-2.0-v2w-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Video2World"},
     "z-image-turbo": {"pretrained_model_name_or_path": "Tongyi-MAI/Z-Image-Turbo"},
+    "z-image-turbo-controlnet": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union"},
+    "z-image-turbo-controlnet-2.x": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.1"},
 }
 
 # Use to configure model sample size when original config is provided
@@ -779,6 +783,12 @@ def infer_diffusers_model_type(checkpoint):
         else:
             raise ValueError(f"Unexpected x_embedder shape: {x_embedder_shape} when loading Cosmos 2.0 model.")
 
+    elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet-2.x"] in checkpoint:
+        model_type = "z-image-turbo-controlnet-2.x"
+
+    elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet"] in checkpoint:
+        model_type = "z-image-turbo-controlnet"
+
     else:
         model_type = "v1"
 
@@ -3885,3 +3895,17 @@ def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str)
             handler_fn_inplace(key, converted_state_dict)
 
     return converted_state_dict
+
+
+def convert_z_image_controlnet_checkpoint_to_diffusers(checkpoint, config, **kwargs):
+    if config["add_control_noise_refiner"] is None:
+        return checkpoint
+    elif config["add_control_noise_refiner"] == "control_noise_refiner":
+        return checkpoint
+    elif config["add_control_noise_refiner"] == "control_layers":
+        converted_state_dict = {
+            key: checkpoint.pop(key) for key in list(checkpoint.keys()) if not key.startswith("control_noise_refiner.")
+        }
+        return converted_state_dict
+    else:
+        raise ValueError("Unknown Z-Image Turbo ControlNet type.")
@@ -66,6 +66,7 @@
     _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
     _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]
     _import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
+    _import_structure["controlnets.controlnet_z_image"] = ["ZImageControlNetModel"]
     _import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"]
     _import_structure["controlnets.multicontrolnet_union"] = ["MultiControlNetUnionModel"]
     _import_structure["embeddings"] = ["ImageProjection"]
@@ -101,6 +102,7 @@
     _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
     _import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"]
     _import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"]
+    _import_structure["transformers.transformer_longcat_image"] = ["LongCatImageTransformer2DModel"]
     _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
     _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
     _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
@@ -180,6 +182,7 @@
             SD3MultiControlNetModel,
             SparseControlNetModel,
             UNetControlNetXSModel,
+            ZImageControlNetModel,
         )
         from .embeddings import ImageProjection
         from .modeling_utils import ModelMixin
@@ -208,6 +211,7 @@
             HunyuanVideoTransformer3DModel,
             Kandinsky5Transformer3DModel,
             LatteTransformer3DModel,
+            LongCatImageTransformer2DModel,
             LTXVideoTransformer3DModel,
             Lumina2Transformer2DModel,
             LuminaNextDiT2DModel,