Skip to content

Commit 301c223

Browse files
authored
Merge branch 'main' into cp-fixes-attn-backends
2 parents 3b1ccd7 + 55463f7 commit 301c223

29 files changed

Lines changed: 5852 additions & 19 deletions

docs/source/en/_toctree.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,8 @@
365365
title: HunyuanVideoTransformer3DModel
366366
- local: api/models/latte_transformer3d
367367
title: LatteTransformer3DModel
368+
- local: api/models/longcat_image_transformer2d
369+
title: LongCatImageTransformer2DModel
368370
- local: api/models/ltx_video_transformer3d
369371
title: LTXVideoTransformer3DModel
370372
- local: api/models/lumina2_transformer2d
@@ -402,7 +404,7 @@
402404
- local: api/models/wan_transformer_3d
403405
title: WanTransformer3DModel
404406
- local: api/models/z_image_transformer2d
405-
title: ZImageTransformer2DModel
407+
title: ZImageTransformer2DModel
406408
title: Transformers
407409
- sections:
408410
- local: api/models/stable_cascade_unet
@@ -563,6 +565,8 @@
563565
title: Latent Diffusion
564566
- local: api/pipelines/ledits_pp
565567
title: LEDITS++
568+
- local: api/pipelines/longcat_image
569+
title: LongCat-Image
566570
- local: api/pipelines/lumina2
567571
title: Lumina 2.0
568572
- local: api/pipelines/lumina
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4+
the License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License.
11+
-->
12+
13+
# LongCatImageTransformer2DModel
14+
15+
The model can be loaded with the following code snippet.
16+
17+
```python
18+
import torch
from diffusers import LongCatImageTransformer2DModel
19+
20+
transformer = LongCatImageTransformer2DModel.from_pretrained("meituan-longcat/LongCat-Image", subfolder="transformer", torch_dtype=torch.bfloat16)
21+
```
22+
23+
## LongCatImageTransformer2DModel
24+
25+
[[autodoc]] LongCatImageTransformer2DModel
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4+
the License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License.
11+
-->
12+
13+
# LongCat-Image
14+
15+
<div class="flex flex-wrap space-x-1">
16+
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
17+
</div>
18+
19+
20+
We introduce LongCat-Image, a pioneering open-source and bilingual (Chinese-English) foundation model for image generation, designed to address core challenges in multilingual text rendering, photorealism, deployment efficiency, and developer accessibility prevalent in current leading models.
21+
22+
23+
### Key Features
24+
- 🌟 **Exceptional Efficiency and Performance**: With only **6B parameters**, LongCat-Image surpasses numerous open-source models that are several times larger across multiple benchmarks, demonstrating the immense potential of efficient model design.
25+
- 🌟 **Superior Editing Performance**: The LongCat-Image-Edit model achieves state-of-the-art performance among open-source models, delivering leading instruction-following and image quality with superior visual consistency.
26+
- 🌟 **Powerful Chinese Text Rendering**: LongCat-Image demonstrates superior accuracy and stability in rendering common Chinese characters compared to existing SOTA open-source models and achieves industry-leading coverage of the Chinese dictionary.
27+
- 🌟 **Remarkable Photorealism**: Through an innovative data strategy and training framework, LongCat-Image achieves remarkable photorealism in generated images.
28+
- 🌟 **Comprehensive Open-Source Ecosystem**: We provide a complete toolchain, from intermediate checkpoints to full training code, significantly lowering the barrier for further research and development.
29+
30+
For more details, please refer to the comprehensive [***LongCat-Image Technical Report***](https://arxiv.org/abs/2412.11963).
31+
32+
33+
## Usage Example
34+
35+
```py
36+
import torch
37+
import diffusers
38+
from diffusers import LongCatImagePipeline
39+
40+
weight_dtype = torch.bfloat16
41+
pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=weight_dtype)
42+
pipe.to('cuda')
43+
# pipe.enable_model_cpu_offload()
44+
45+
prompt = '一个年轻的亚裔女性,身穿黄色针织衫,搭配白色项链。她的双手放在膝盖上,表情恬静。背景是一堵粗糙的砖墙,午后的阳光温暖地洒在她身上,营造出一种宁静而温馨的氛围。镜头采用中距离视角,突出她的神态和服饰的细节。光线柔和地打在她的脸上,强调她的五官和饰品的质感,增加画面的层次感与亲和力。整个画面构图简洁,砖墙的纹理与阳光的光影效果相得益彰,突显出人物的优雅与从容。'
46+
image = pipe(
47+
prompt,
48+
height=768,
49+
width=1344,
50+
guidance_scale=4.0,
51+
num_inference_steps=50,
52+
num_images_per_prompt=1,
53+
generator=torch.Generator("cpu").manual_seed(43),
54+
enable_cfg_renorm=True,
55+
enable_prompt_rewrite=True,
56+
).images[0]
57+
image.save('./longcat_image_t2i_example.png')
58+
```
59+
60+
61+
This pipeline was contributed by the LongCat-Image Team. The original codebase can be found [here](https://github.com/meituan-longcat/LongCat-Image).
62+
63+
Available models:
64+
<div style="overflow-x: auto; margin-bottom: 16px;">
65+
<table style="border-collapse: collapse; width: 100%;">
66+
<thead>
67+
<tr>
68+
<th style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Models</th>
69+
<th style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Type</th>
70+
<th style="padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Description</th>
71+
<th style="padding: 8px; border: 1px solid #d0d7de; background-color: #f6f8fa;">Download Link</th>
72+
</tr>
73+
</thead>
74+
<tbody>
75+
<tr>
76+
<td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image</td>
77+
<td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Text&#8209;to&#8209;Image</td>
78+
<td style="padding: 8px; border: 1px solid #d0d7de;">Final Release. The standard model for out&#8209;of&#8209;the&#8209;box inference.</td>
79+
<td style="padding: 8px; border: 1px solid #d0d7de;">
80+
<span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image">Hugging Face</a></span>
81+
</td>
82+
</tr>
83+
<tr>
84+
<td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image&#8209;Dev</td>
85+
<td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Text&#8209;to&#8209;Image</td>
86+
<td style="padding: 8px; border: 1px solid #d0d7de;">Development. Mid-training checkpoint, suitable for fine-tuning.</td>
87+
<td style="padding: 8px; border: 1px solid #d0d7de;">
88+
<span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image-Dev">Hugging Face</a></span>
89+
</td>
90+
</tr>
91+
<tr>
92+
<td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">LongCat&#8209;Image&#8209;Edit</td>
93+
<td style="white-space: nowrap; padding: 8px; border: 1px solid #d0d7de;">Image Editing</td>
94+
<td style="padding: 8px; border: 1px solid #d0d7de;">Specialized model for image editing.</td>
95+
<td style="padding: 8px; border: 1px solid #d0d7de;">
96+
<span style="white-space: nowrap;">🤗&nbsp;<a href="https://huggingface.co/meituan-longcat/LongCat-Image-Edit">Hugging Face</a></span>
97+
</td>
98+
</tr>
99+
</tbody>
100+
</table>
101+
</div>
102+
103+
## LongCatImagePipeline
104+
105+
[[autodoc]] LongCatImagePipeline
106+
- all
107+
- __call__
108+
109+
## LongCatImagePipelineOutput
110+
111+
[[autodoc]] pipelines.longcat_image.pipeline_output.LongCatImagePipelineOutput
112+
113+
114+

src/diffusers/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@
235235
"Kandinsky3UNet",
236236
"Kandinsky5Transformer3DModel",
237237
"LatteTransformer3DModel",
238+
"LongCatImageTransformer2DModel",
238239
"LTXVideoTransformer3DModel",
239240
"Lumina2Transformer2DModel",
240241
"LuminaNextDiT2DModel",
@@ -278,6 +279,7 @@
278279
"WanAnimateTransformer3DModel",
279280
"WanTransformer3DModel",
280281
"WanVACETransformer3DModel",
282+
"ZImageControlNetModel",
281283
"ZImageTransformer2DModel",
282284
"attention_backend",
283285
]
@@ -532,6 +534,8 @@
532534
"LDMTextToImagePipeline",
533535
"LEditsPPPipelineStableDiffusion",
534536
"LEditsPPPipelineStableDiffusionXL",
537+
"LongCatImageEditPipeline",
538+
"LongCatImagePipeline",
535539
"LTXConditionPipeline",
536540
"LTXImageToVideoPipeline",
537541
"LTXLatentUpsamplePipeline",
@@ -561,6 +565,7 @@
561565
"QwenImageEditPlusPipeline",
562566
"QwenImageImg2ImgPipeline",
563567
"QwenImageInpaintPipeline",
568+
"QwenImageLayeredPipeline",
564569
"QwenImagePipeline",
565570
"ReduxImageEncoder",
566571
"SanaControlNetPipeline",
@@ -666,6 +671,8 @@
666671
"WuerstchenCombinedPipeline",
667672
"WuerstchenDecoderPipeline",
668673
"WuerstchenPriorPipeline",
674+
"ZImageControlNetInpaintPipeline",
675+
"ZImageControlNetPipeline",
669676
"ZImageImg2ImgPipeline",
670677
"ZImagePipeline",
671678
]
@@ -970,6 +977,7 @@
970977
Kandinsky3UNet,
971978
Kandinsky5Transformer3DModel,
972979
LatteTransformer3DModel,
980+
LongCatImageTransformer2DModel,
973981
LTXVideoTransformer3DModel,
974982
Lumina2Transformer2DModel,
975983
LuminaNextDiT2DModel,
@@ -1012,6 +1020,7 @@
10121020
WanAnimateTransformer3DModel,
10131021
WanTransformer3DModel,
10141022
WanVACETransformer3DModel,
1023+
ZImageControlNetModel,
10151024
ZImageTransformer2DModel,
10161025
attention_backend,
10171026
)
@@ -1237,6 +1246,8 @@
12371246
LDMTextToImagePipeline,
12381247
LEditsPPPipelineStableDiffusion,
12391248
LEditsPPPipelineStableDiffusionXL,
1249+
LongCatImageEditPipeline,
1250+
LongCatImagePipeline,
12401251
LTXConditionPipeline,
12411252
LTXImageToVideoPipeline,
12421253
LTXLatentUpsamplePipeline,
@@ -1266,6 +1277,7 @@
12661277
QwenImageEditPlusPipeline,
12671278
QwenImageImg2ImgPipeline,
12681279
QwenImageInpaintPipeline,
1280+
QwenImageLayeredPipeline,
12691281
QwenImagePipeline,
12701282
ReduxImageEncoder,
12711283
SanaControlNetPipeline,
@@ -1369,6 +1381,8 @@
13691381
WuerstchenCombinedPipeline,
13701382
WuerstchenDecoderPipeline,
13711383
WuerstchenPriorPipeline,
1384+
ZImageControlNetInpaintPipeline,
1385+
ZImageControlNetPipeline,
13721386
ZImageImg2ImgPipeline,
13731387
ZImagePipeline,
13741388
)

src/diffusers/loaders/single_file_model.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
convert_stable_cascade_unet_single_file_to_diffusers,
5050
convert_wan_transformer_to_diffusers,
5151
convert_wan_vae_to_diffusers,
52+
convert_z_image_controlnet_checkpoint_to_diffusers,
5253
convert_z_image_transformer_checkpoint_to_diffusers,
5354
create_controlnet_diffusers_config_from_ldm,
5455
create_unet_diffusers_config_from_ldm,
@@ -172,11 +173,18 @@
172173
"checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
173174
"default_subfolder": "transformer",
174175
},
176+
"ZImageControlNetModel": {
177+
"checkpoint_mapping_fn": convert_z_image_controlnet_checkpoint_to_diffusers,
178+
},
175179
}
176180

177181

178182
def _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint_state_dict):
179-
return not set(model_state_dict.keys()).issubset(set(checkpoint_state_dict.keys()))
183+
model_state_dict_keys = set(model_state_dict.keys())
184+
checkpoint_state_dict_keys = set(checkpoint_state_dict.keys())
185+
is_subset = model_state_dict_keys.issubset(checkpoint_state_dict_keys)
186+
is_match = model_state_dict_keys == checkpoint_state_dict_keys
187+
return not (is_subset and is_match)
180188

181189

182190
def _get_single_file_loadable_mapping_class(cls):

src/diffusers/loaders/single_file_utils.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@
121121
"instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight",
122122
"lumina2": ["model.diffusion_model.cap_embedder.0.weight", "cap_embedder.0.weight"],
123123
"z-image-turbo": "cap_embedder.0.weight",
124+
"z-image-turbo-controlnet": "control_all_x_embedder.2-1.weight",
125+
"z-image-turbo-controlnet-2.x": "control_layers.14.adaLN_modulation.0.weight",
124126
"sana": [
125127
"blocks.0.cross_attn.q_linear.weight",
126128
"blocks.0.cross_attn.q_linear.bias",
@@ -220,6 +222,8 @@
220222
"cosmos-2.0-v2w-2B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-2B-Video2World"},
221223
"cosmos-2.0-v2w-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Video2World"},
222224
"z-image-turbo": {"pretrained_model_name_or_path": "Tongyi-MAI/Z-Image-Turbo"},
225+
"z-image-turbo-controlnet": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union"},
226+
"z-image-turbo-controlnet-2.x": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.1"},
223227
}
224228

225229
# Use to configure model sample size when original config is provided
@@ -779,6 +783,12 @@ def infer_diffusers_model_type(checkpoint):
779783
else:
780784
raise ValueError(f"Unexpected x_embedder shape: {x_embedder_shape} when loading Cosmos 2.0 model.")
781785

786+
elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet-2.x"] in checkpoint:
787+
model_type = "z-image-turbo-controlnet-2.x"
788+
789+
elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet"] in checkpoint:
790+
model_type = "z-image-turbo-controlnet"
791+
782792
else:
783793
model_type = "v1"
784794

@@ -3885,3 +3895,17 @@ def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str)
38853895
handler_fn_inplace(key, converted_state_dict)
38863896

38873897
return converted_state_dict
3898+
3899+
3900+
def convert_z_image_controlnet_checkpoint_to_diffusers(checkpoint, config, **kwargs):
    """Select the Z-Image Turbo ControlNet weights that match ``config``.

    Args:
        checkpoint: Mapping of parameter names to values. In the
            ``"control_layers"`` case the retained entries are popped out of
            this mapping, so it is mutated in place and afterwards holds only
            the ``control_noise_refiner.*`` entries.
        config: Mapping whose ``"add_control_noise_refiner"`` entry selects the
            variant. A missing entry is treated the same as ``None``.
        **kwargs: Ignored.

    Returns:
        ``checkpoint`` itself when the entry is ``None`` or
        ``"control_noise_refiner"``; for ``"control_layers"``, a new dict with
        every key that does not start with ``"control_noise_refiner."``.

    Raises:
        ValueError: If the entry holds any other value.
    """
    # Look the setting up once; ``.get`` keeps configs that omit the entry
    # from failing with a KeyError before any branch is evaluated.
    refiner_mode = config.get("add_control_noise_refiner")

    if refiner_mode is None or refiner_mode == "control_noise_refiner":
        # Nothing to strip: the checkpoint is used as-is.
        return checkpoint

    if refiner_mode == "control_layers":
        # Snapshot the keys first because popping mutates ``checkpoint``
        # while we iterate.
        return {
            key: checkpoint.pop(key)
            for key in list(checkpoint.keys())
            if not key.startswith("control_noise_refiner.")
        }

    raise ValueError("Unknown Z-Image Turbo ControlNet type.")

src/diffusers/models/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
_import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
6767
_import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]
6868
_import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
69+
_import_structure["controlnets.controlnet_z_image"] = ["ZImageControlNetModel"]
6970
_import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"]
7071
_import_structure["controlnets.multicontrolnet_union"] = ["MultiControlNetUnionModel"]
7172
_import_structure["embeddings"] = ["ImageProjection"]
@@ -101,6 +102,7 @@
101102
_import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
102103
_import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"]
103104
_import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"]
105+
_import_structure["transformers.transformer_longcat_image"] = ["LongCatImageTransformer2DModel"]
104106
_import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
105107
_import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
106108
_import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
@@ -180,6 +182,7 @@
180182
SD3MultiControlNetModel,
181183
SparseControlNetModel,
182184
UNetControlNetXSModel,
185+
ZImageControlNetModel,
183186
)
184187
from .embeddings import ImageProjection
185188
from .modeling_utils import ModelMixin
@@ -208,6 +211,7 @@
208211
HunyuanVideoTransformer3DModel,
209212
Kandinsky5Transformer3DModel,
210213
LatteTransformer3DModel,
214+
LongCatImageTransformer2DModel,
211215
LTXVideoTransformer3DModel,
212216
Lumina2Transformer2DModel,
213217
LuminaNextDiT2DModel,

0 commit comments

Comments
 (0)