Skip to content

Commit b95637a

Browse files
authored
[fix] CLIPTextModel with transformers >= 5.6 and from_single_file (#13843)
* fix
* code quality
1 parent e1db6d1 commit b95637a

1 file changed

Lines changed: 9 additions & 1 deletion

File tree

src/diffusers/loaders/single_file_utils.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1702,7 +1702,10 @@ def create_diffusers_clip_model_from_ldm(
17021702
with ctx():
17031703
model = cls(model_config)
17041704

1705-
position_embedding_dim = model.text_model.embeddings.position_embedding.weight.shape[-1]
1705+
# `CLIPTextModel` was flattened in transformers >=5.6; `CLIPTextModelWithProjection` still wraps via `text_model`.
1706+
has_text_model_wrapper = hasattr(model, "text_model")
1707+
text_model = model.text_model if has_text_model_wrapper else model
1708+
position_embedding_dim = text_model.embeddings.position_embedding.weight.shape[-1]
17061709

17071710
if is_clip_model(checkpoint):
17081711
diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint)
@@ -1744,6 +1747,11 @@ def create_diffusers_clip_model_from_ldm(
17441747
else:
17451748
raise ValueError("The provided checkpoint does not seem to contain a valid CLIP model.")
17461749

1750+
if not has_text_model_wrapper:
1751+
diffusers_format_checkpoint = {
1752+
k.removeprefix("text_model."): v for k, v in diffusers_format_checkpoint.items()
1753+
}
1754+
17471755
if is_accelerate_available():
17481756
load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
17491757
empty_device_cache()

0 commit comments

Comments
 (0)