ovis_image model/pipeline review #13630

@hlky

Description

Commit tested: 0f1abc4ae8b0eb2a3b40e82a310507281144c423

Review performed against the repository review rules. AGENTS.md is referenced by .ai/review-rules.md but is not present in this checkout; the other referenced rule files were read and applied.

Files/categories reviewed: target pipeline/model files, public imports and lazy loading, config/serialization, dtype/device/offload paths, attention processor behavior, docs, and fast/slow test coverage.

Duplicate search status: gh search hit the GitHub API rate limit, so I fell back to GitHub web issue/PR searches for OvisImage, ovis_image, OvisImageTransformer2DModel AttentionMixin, and OvisImagePipeline num_images_per_prompt. I did not find an exact Ovis duplicate. Related but not a duplicate: #12186 covers the same missing-AttentionMixin pattern for WanVACETransformer3DModel.

Issue 1: Transformer does not expose attention processor APIs

Affected code:

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import logging
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import AttentionModuleMixin, FeedForward
from ..attention_dispatch import dispatch_attention_fn
from ..cache_utils import CacheMixin

class OvisImageTransformer2DModel(
    ModelMixin,
    ConfigMixin,
    PeftAdapterMixin,
    FromOriginalModelMixin,
    CacheMixin,
):

Problem:
OvisImageTransformer2DModel defines OvisImageAttention modules but does not inherit AttentionMixin. That leaves the model without the standard attn_processors, set_attn_processor, fuse_qkv_projections, and unfuse_qkv_projections APIs expected by related transformer families.

Impact:
Users and tests cannot swap attention processors, inspect processors, or use QKV fusion through the model-level API.

Reproduction:

from diffusers import OvisImageTransformer2DModel

print(hasattr(OvisImageTransformer2DModel, "set_attn_processor"))
print(hasattr(OvisImageTransformer2DModel, "fuse_qkv_projections"))
assert hasattr(OvisImageTransformer2DModel, "set_attn_processor")

Relevant precedent:

from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
from ..attention_dispatch import dispatch_attention_fn
from ..cache_utils import CacheMixin
from ..embeddings import (
    CombinedTimestepGuidanceTextProjEmbeddings,
    CombinedTimestepTextProjEmbeddings,

class FluxTransformer2DModel(
    ModelMixin,
    ConfigMixin,
    PeftAdapterMixin,
    FromOriginalModelMixin,
    FluxTransformer2DLoadersMixin,
    CacheMixin,
    AttentionMixin,
):

from ..attention import AttentionMixin, FeedForward

Suggested fix:

from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward

class OvisImageTransformer2DModel(
    ModelMixin,
    ConfigMixin,
    PeftAdapterMixin,
    FromOriginalModelMixin,
    CacheMixin,
    AttentionMixin,
):
    ...
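
Once the mixin is added, a quick class-level check (no weights required) can confirm the API surface; this sketch assumes AttentionMixin stays importable from diffusers.models.attention, as in the Flux precedent above.

from diffusers import OvisImageTransformer2DModel
from diffusers.models.attention import AttentionMixin

# Class-level checks only; no model instantiation or weights needed.
assert issubclass(OvisImageTransformer2DModel, AttentionMixin)
assert hasattr(OvisImageTransformer2DModel, "attn_processors")
assert hasattr(OvisImageTransformer2DModel, "set_attn_processor")
assert hasattr(OvisImageTransformer2DModel, "fuse_qkv_projections")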

Issue 2: joint_attention_kwargs is accepted but never reaches attention

Affected code:

self._joint_attention_kwargs = joint_attention_kwargs
self._current_timestep = None
self._interrupt = False

# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
    batch_size = 1
elif prompt is not None and isinstance(prompt, list):
    batch_size = len(prompt)
else:
    batch_size = prompt_embeds.shape[0]
device = self._execution_device
do_classifier_free_guidance = guidance_scale > 1
(
    prompt_embeds,
    text_ids,
) = self.encode_prompt(
    prompt=prompt,
    prompt_embeds=prompt_embeds,
    device=device,
    num_images_per_prompt=num_images_per_prompt,
)
if do_classifier_free_guidance:
    (
        negative_prompt_embeds,
        negative_text_ids,
    ) = self.encode_prompt(
        prompt=negative_prompt,
        prompt_embeds=negative_prompt_embeds,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
    )

# 4. Prepare latent variables
num_channels_latents = self.transformer.config.in_channels // 4
latents, latent_image_ids = self.prepare_latents(
    batch_size * num_images_per_prompt,
    num_channels_latents,
    height,
    width,
    prompt_embeds.dtype,
    device,
    generator,
    latents,
)

# 5. Prepare timesteps
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
if hasattr(self.scheduler.config, "use_flow_sigmas") and self.scheduler.config.use_flow_sigmas:
    sigmas = None
image_seq_len = latents.shape[1]
mu = calculate_shift(
    image_seq_len,
    self.scheduler.config.get("base_image_seq_len", 256),
    self.scheduler.config.get("max_image_seq_len", 4096),
    self.scheduler.config.get("base_shift", 0.5),
    self.scheduler.config.get("max_shift", 1.15),
)
timesteps, num_inference_steps = retrieve_timesteps(
    self.scheduler,
    num_inference_steps,
    device,
    sigmas=sigmas,
    mu=mu,
)
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
self._num_timesteps = len(timesteps)

if self.joint_attention_kwargs is None:
    self._joint_attention_kwargs = {}

with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latents,
timestep=timestep / 1000,
encoder_hidden_states=prompt_embeds,
txt_ids=text_ids,
img_ids=latent_image_ids,
return_dict=False,
)[0]
if do_classifier_free_guidance:
with self.transformer.cache_context("uncond"):
neg_noise_pred = self.transformer(
hidden_states=latents,
timestep=timestep / 1000,
encoder_hidden_states=negative_prompt_embeds,
txt_ids=negative_text_ids,
img_ids=latent_image_ids,
return_dict=False,
)[0]

def forward(
    self,
    hidden_states: torch.Tensor,
    encoder_hidden_states: torch.Tensor = None,
    timestep: torch.LongTensor = None,
    img_ids: torch.Tensor = None,
    txt_ids: torch.Tensor = None,
    return_dict: bool = True,
) -> torch.Tensor | Transformer2DModelOutput:

for index_block, block in enumerate(self.transformer_blocks):
    if torch.is_grad_enabled() and self.gradient_checkpointing:
        encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
            block,
            hidden_states,
            encoder_hidden_states,
            temb,
            image_rotary_emb,
        )
    else:
        encoder_hidden_states, hidden_states = block(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            temb=temb,
            image_rotary_emb=image_rotary_emb,
        )

for index_block, block in enumerate(self.single_transformer_blocks):
    if torch.is_grad_enabled() and self.gradient_checkpointing:
        encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
            block,
            hidden_states,
            encoder_hidden_states,
            temb,
            image_rotary_emb,
        )
    else:
        encoder_hidden_states, hidden_states = block(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            temb=temb,
            image_rotary_emb=image_rotary_emb,
        )

Problem:
The pipeline exposes joint_attention_kwargs, but OvisImageTransformer2DModel.forward() does not accept it, and the pipeline transformer calls do not pass it. The block classes already accept joint_attention_kwargs, so the plumbing is incomplete.

Impact:
Any user-provided attention kwargs are silently ignored by the pipeline. Direct model calls with the same argument fail.

Reproduction:

import inspect
from diffusers import OvisImagePipeline, OvisImageTransformer2DModel

print("pipeline:", "joint_attention_kwargs" in inspect.signature(OvisImagePipeline.__call__).parameters)
print("model:", "joint_attention_kwargs" in inspect.signature(OvisImageTransformer2DModel.forward).parameters)
assert "joint_attention_kwargs" in inspect.signature(OvisImageTransformer2DModel.forward).parameters

Relevant precedent:

joint_attention_kwargs: dict[str, Any] | None = None,
controlnet_block_samples=None,

        joint_attention_kwargs,
    )
else:
    encoder_hidden_states, hidden_states = block(
        hidden_states=hidden_states,
        encoder_hidden_states=encoder_hidden_states,
        temb=temb,
        image_rotary_emb=image_rotary_emb,
        joint_attention_kwargs=joint_attention_kwargs,

Suggested fix:

def forward(..., joint_attention_kwargs: dict[str, Any] | None = None, return_dict: bool = True):
    ...
    encoder_hidden_states, hidden_states = block(
        hidden_states=hidden_states,
        encoder_hidden_states=encoder_hidden_states,
        temb=temb,
        image_rotary_emb=image_rotary_emb,
        joint_attention_kwargs=joint_attention_kwargs,
    )

Also pass joint_attention_kwargs=self.joint_attention_kwargs in both pipeline transformer calls.
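
A sketch of those two call sites with the argument threaded through (mirroring the cond/uncond calls quoted above; only the new keyword is added):

with self.transformer.cache_context("cond"):
    noise_pred = self.transformer(
        hidden_states=latents,
        timestep=timestep / 1000,
        encoder_hidden_states=prompt_embeds,
        txt_ids=text_ids,
        img_ids=latent_image_ids,
        joint_attention_kwargs=self.joint_attention_kwargs,
        return_dict=False,
    )[0]
if do_classifier_free_guidance:
    with self.transformer.cache_context("uncond"):
        neg_noise_pred = self.transformer(
            hidden_states=latents,
            timestep=timestep / 1000,
            encoder_hidden_states=negative_prompt_embeds,
            txt_ids=negative_text_ids,
            img_ids=latent_image_ids,
            joint_attention_kwargs=self.joint_attention_kwargs,
            return_dict=False,
        )[0]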

Issue 3: guidance_scale property is never initialized

Affected code:

def guidance_scale(self):
    return self._guidance_scale

self._joint_attention_kwargs = joint_attention_kwargs
self._current_timestep = None
self._interrupt = False

Problem:
OvisImagePipeline.guidance_scale returns self._guidance_scale, but __call__ never assigns self._guidance_scale = guidance_scale.

Impact:
Callbacks or downstream code that read pipe.guidance_scale during generation can hit an AttributeError or see stale state, unlike in related pipelines.

Reproduction:

import inspect
from diffusers import OvisImagePipeline

source = inspect.getsource(OvisImagePipeline.__call__)
print("self._guidance_scale = guidance_scale" in source)
assert "self._guidance_scale = guidance_scale" in source

Relevant precedent:

self._guidance_scale = guidance_scale
self._joint_attention_kwargs = joint_attention_kwargs
self._current_timestep = None

self._guidance_scale = guidance_scale
self._attention_kwargs = attention_kwargs
self._current_timestep = None

Suggested fix:

self._guidance_scale = guidance_scale
self._joint_attention_kwargs = joint_attention_kwargs
self._current_timestep = None
self._interrupt = False
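
For reference, this is the kind of step-end callback that depends on the property; a minimal sketch, with the callback signature following the one documented in the pipeline docstring:

def log_guidance_scale(pipe, step, timestep, callback_kwargs):
    # Reads the property that currently has no backing attribute in OvisImagePipeline.
    print(f"step {step}: guidance_scale={pipe.guidance_scale}")
    return callback_kwargs

# Passed as callback_on_step_end=log_guidance_scale when calling the pipeline.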

Issue 4: Batched prompts break with default negative prompt under CFG

Affected code:

if negative_prompt is not None and negative_prompt_embeds is not None:
    raise ValueError(
        f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
        f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
    )

# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
    batch_size = 1
elif prompt is not None and isinstance(prompt, list):
    batch_size = len(prompt)
else:
    batch_size = prompt_embeds.shape[0]
device = self._execution_device
do_classifier_free_guidance = guidance_scale > 1
(
    prompt_embeds,
    text_ids,
) = self.encode_prompt(
    prompt=prompt,
    prompt_embeds=prompt_embeds,
    device=device,
    num_images_per_prompt=num_images_per_prompt,
)
if do_classifier_free_guidance:
    (
        negative_prompt_embeds,
        negative_text_ids,
    ) = self.encode_prompt(
        prompt=negative_prompt,
        prompt_embeds=negative_prompt_embeds,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
    )

Problem:
With prompt as a list and default negative_prompt="", positive embeddings are batched to len(prompt), but negative embeddings are encoded as batch size 1. CFG then calls the transformer with mismatched latent and negative prompt batch sizes.

Impact:
The default CFG path fails for normal batched text-to-image usage unless users manually pass a negative prompt list of matching length.

Reproduction:

import torch
from diffusers import OvisImagePipeline

pipe = OvisImagePipeline.__new__(OvisImagePipeline)
pipe.text_encoder = type("E", (), {"dtype": torch.float32})()
pipe.transformer = type("T", (), {"dtype": torch.float32})()

def fake_get_ovis_prompt_embeds(prompt, num_images_per_prompt=1, device=None, dtype=None):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    return torch.zeros(len(prompt) * num_images_per_prompt, 4, 8)

pipe._get_ovis_prompt_embeds = fake_get_ovis_prompt_embeds
pos, _ = pipe.encode_prompt(["cat", "dog"], device=torch.device("cpu"))
neg, _ = pipe.encode_prompt("", device=torch.device("cpu"))
print(pos.shape[0], neg.shape[0])
assert pos.shape[0] == neg.shape[0]

Relevant precedent:

if do_classifier_free_guidance and negative_prompt_embeds is None:
    negative_prompt = negative_prompt or ""
    uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
    max_length = prompt_embeds.shape[1]
    uncond_input = self.tokenizer(
        uncond_tokens,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )
    uncond_input = {k: v.to(device) for k, v in uncond_input.items()}
    negative_prompt_embeds = self.text_encoder(**uncond_input)[0]
    negative_prompt_attention_mask = (
        uncond_input["attention_mask"].unsqueeze(-1).expand(negative_prompt_embeds.shape)
    )
    negative_prompt_embeds = negative_prompt_embeds * negative_prompt_attention_mask

if do_classifier_free_guidance:
    # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
    seq_len = negative_prompt_embeds.shape[1]
    negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
    negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
    negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

Suggested fix:

if do_classifier_free_guidance and negative_prompt_embeds is None:
    if negative_prompt is None:
        negative_prompt = ""
    if isinstance(negative_prompt, str):
        negative_prompt = [negative_prompt] * batch_size
    elif len(negative_prompt) != batch_size:
        raise ValueError(
            f"`negative_prompt` has batch size {len(negative_prompt)}, but `prompt` has batch size {batch_size}."
        )

Issue 5: Precomputed prompt_embeds are not moved or repeated

Affected code:

def encode_prompt(
    self,
    prompt: str | list[str],
    device: torch.device | None = None,
    num_images_per_prompt: int = 1,
    prompt_embeds: torch.FloatTensor | None = None,
):
    r"""
    Args:
        prompt (`str`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
    """
    device = device or self._execution_device
    if prompt_embeds is None:
        prompt_embeds = self._get_ovis_prompt_embeds(
            prompt=prompt,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
        )
    dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
    text_ids = torch.zeros(prompt_embeds.shape[1], 3)
    text_ids[..., 1] = text_ids[..., 1] + torch.arange(prompt_embeds.shape[1])[None, :]
    text_ids[..., 2] = text_ids[..., 2] + torch.arange(prompt_embeds.shape[1])[None, :]
    text_ids = text_ids.to(device=device, dtype=dtype)
    return prompt_embeds, text_ids

# 4. Prepare latent variables
num_channels_latents = self.transformer.config.in_channels // 4
latents, latent_image_ids = self.prepare_latents(
    batch_size * num_images_per_prompt,
    num_channels_latents,
    height,
    width,
    prompt_embeds.dtype,
    device,
    generator,
    latents,
)

Problem:
When prompt_embeds is supplied, encode_prompt() does not move it to the execution device/dtype and does not repeat it for num_images_per_prompt. The pipeline still prepares latents for batch_size * num_images_per_prompt.

Impact:
Precomputed embeddings can fail with device mismatches on GPU/offload paths and batch mismatches when generating multiple images per prompt.

Reproduction:

import torch
from diffusers import OvisImagePipeline

pipe = OvisImagePipeline.__new__(OvisImagePipeline)
pipe.text_encoder = None
pipe.transformer = type("T", (), {"dtype": torch.float16})()

embeds = torch.randn(1, 4, 8, dtype=torch.float32)
out, ids = pipe.encode_prompt(None, device=torch.device("meta"), prompt_embeds=embeds, num_images_per_prompt=2)

print(out.shape, out.device, ids.device)
assert out.shape[0] == 2
assert out.device == ids.device

Relevant precedent:

prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
if prompt_embeds is None:
    prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device)
prompt_embeds = prompt_embeds[:, :max_sequence_length]
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
if prompt_embeds_mask is not None:
    prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length]
    prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
    prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)

Suggested fix:

batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]

if prompt_embeds is None:
    prompt_embeds = self._get_ovis_prompt_embeds(...)

dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
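
With that change, the reproduction above should pass; a sketch of the expected behavior, reusing the same stub pipeline:

embeds = torch.randn(1, 4, 8, dtype=torch.float32)
out, ids = pipe.encode_prompt(None, device=torch.device("meta"), prompt_embeds=embeds, num_images_per_prompt=2)

assert out.shape == (2, 4, 8)      # repeated for num_images_per_prompt
assert out.device == ids.device    # both moved to the execution device
assert out.dtype == torch.float16  # cast to the transformer dtype since text_encoder is None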

Issue 6: max_sequence_length is validated but ignored

Affected code:

def _get_ovis_prompt_embeds(
    self,
    prompt: str | list[str] = None,
    num_images_per_prompt: int = 1,
    device: torch.device | None = None,
    dtype: torch.dtype | None = None,
):
    device = device or self._execution_device
    dtype = dtype or self.text_encoder.dtype
    messages = self._get_messages(prompt)
    batch_size = len(messages)
    tokens = self.tokenizer(
        messages,
        padding="max_length",
        truncation=True,
        max_length=self.tokenizer_max_length,
        return_tensors="pt",
        add_special_tokens=False,
    )
    input_ids = tokens.input_ids.to(device)
    attention_mask = tokens.attention_mask.to(device)
    outputs = self.text_encoder(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    prompt_embeds = outputs.last_hidden_state
    prompt_embeds = prompt_embeds * attention_mask[..., None]

def encode_prompt(
    self,
    prompt: str | list[str],
    device: torch.device | None = None,
    num_images_per_prompt: int = 1,
    prompt_embeds: torch.FloatTensor | None = None,
):
    r"""
    Args:
        prompt (`str`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
    """
    device = device or self._execution_device
    if prompt_embeds is None:
        prompt_embeds = self._get_ovis_prompt_embeds(
            prompt=prompt,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
        )
    dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
    text_ids = torch.zeros(prompt_embeds.shape[1], 3)
    text_ids[..., 1] = text_ids[..., 1] + torch.arange(prompt_embeds.shape[1])[None, :]
    text_ids[..., 2] = text_ids[..., 2] + torch.arange(prompt_embeds.shape[1])[None, :]
    text_ids = text_ids.to(device=device, dtype=dtype)
    return prompt_embeds, text_ids

    callback_on_step_end_tensor_inputs: list[str] = ["latents"],
    max_sequence_length: int = 256,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `list[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
            instead.
        negative_prompt (`str` or `list[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            not greater than `1`).
        guidance_scale (`float`, *optional*, defaults to 1.0):
            True classifier-free guidance (guidance scale) is enabled when `guidance_scale` > 1 and
            `negative_prompt` is provided.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The height in pixels of the generated image. This is set to 1024 by default for the best results.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The width in pixels of the generated image. This is set to 1024 by default for the best results.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        sigmas (`list[float]`, *optional*):
            Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
            their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
            will be used.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generate image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
        joint_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        callback_on_step_end (`Callable`, *optional*):
            A function that calls at the end of each denoising steps during the inference. The function is called
            with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
            callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
            `callback_on_step_end_tensor_inputs`.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.
        max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.

    Examples:

    Returns:
        [`~pipelines.ovis_image.OvisImagePipelineOutput`] or `tuple`:
        [`~pipelines.ovis_image.OvisImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
        returning a tuple, the first element is a list with the generated images.
    """
    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
        max_sequence_length=max_sequence_length,
    )
    self._joint_attention_kwargs = joint_attention_kwargs
    self._current_timestep = None
    self._interrupt = False

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]
    device = self._execution_device
    do_classifier_free_guidance = guidance_scale > 1
    (
        prompt_embeds,
        text_ids,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_embeds=prompt_embeds,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
    )
    if do_classifier_free_guidance:
        (
            negative_prompt_embeds,
            negative_text_ids,
        ) = self.encode_prompt(
            prompt=negative_prompt,
            prompt_embeds=negative_prompt_embeds,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
        )

Problem:
__call__ accepts and validates max_sequence_length, but encode_prompt() has no such parameter and _get_ovis_prompt_embeds() always tokenizes with self.tokenizer_max_length.

Impact:
Users cannot reduce prompt sequence length for speed/memory, and the public argument is misleading.

Reproduction:

import inspect
from diffusers import OvisImagePipeline

assert "max_sequence_length" in inspect.signature(OvisImagePipeline.__call__).parameters
assert "max_sequence_length" in inspect.signature(OvisImagePipeline.encode_prompt).parameters

Relevant precedent:

def encode_prompt(
    self,
    prompt: str | list[str],
    device: torch.device | None = None,
    num_images_per_prompt: int = 1,
    prompt_embeds: torch.Tensor | None = None,
    prompt_embeds_mask: torch.Tensor | None = None,
    max_sequence_length: int = 1024,
):
    r"""
    Args:
        prompt (`str` or `list[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
    """
    device = device or self._execution_device
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
    if prompt_embeds is None:
        prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device)
    prompt_embeds = prompt_embeds[:, :max_sequence_length]
    _, seq_len, _ = prompt_embeds.shape
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
    if prompt_embeds_mask is not None:
        prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length]
        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)

Suggested fix:

def _get_ovis_prompt_embeds(..., max_sequence_length: int = 256):
    max_length = max_sequence_length + self.user_prompt_begin_id
    tokens = self.tokenizer(..., max_length=max_length, ...)
    prompt_embeds = prompt_embeds[:, self.user_prompt_begin_id : self.user_prompt_begin_id + max_sequence_length, :]

def encode_prompt(..., max_sequence_length: int = 256):
    ...

Then pass max_sequence_length=max_sequence_length from __call__.
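
The middle link is encode_prompt forwarding the argument down to _get_ovis_prompt_embeds; a sketch, with all other lines unchanged from the affected code above:

def encode_prompt(
    self,
    prompt,
    device=None,
    num_images_per_prompt=1,
    prompt_embeds=None,
    max_sequence_length=256,
):
    device = device or self._execution_device
    if prompt_embeds is None:
        prompt_embeds = self._get_ovis_prompt_embeds(
            prompt=prompt,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
        )
    ...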

Issue 7: No fast or slow tests cover ovis_image

Affected code:
https://github.com/huggingface/diffusers/blob/0f1abc4ae8b0eb2a3b40e82a310507281144c423/tests/pipelines/ovis_image/__init__.py#L1

Problem:
tests/pipelines/ovis_image/ only contains an empty __init__.py, and there is no model test for OvisImageTransformer2DModel. No fast or slow tests reference OvisImage or ovis_image.

Impact:
The import surface, prompt batching, callback properties, attention APIs, serialization, and slow checkpoint path can regress without CI coverage.

Reproduction:

from pathlib import Path

paths = []
for path in Path("tests").rglob("test*.py"):
    text = path.read_text(encoding="utf-8", errors="ignore")
    if "OvisImage" in text or "ovis_image" in text or "ovis-image" in text.lower():
        paths.append(str(path))

print(paths)
assert paths, "No fast or slow Ovis tests found"

Relevant precedent:

class QwenImageTransformerTesterConfig(BaseModelTesterConfig):
    @property
    def model_class(self):
        return QwenImageTransformer2DModel

    @property
    def output_shape(self) -> tuple[int, int]:
        return (16, 16)

    @property
    def input_shape(self) -> tuple[int, int]:
        return (16, 16)

    @property
    def model_split_percents(self) -> list:
        return [0.7, 0.6, 0.6]

    @property
    def main_input_name(self) -> str:
        return "hidden_states"

    @property
    def generator(self):
        return torch.Generator("cpu").manual_seed(0)

    def get_init_dict(self) -> dict[str, int | list[int]]:
        return {
            "patch_size": 2,
            "in_channels": 16,
            "out_channels": 4,
            "num_layers": 2,
            "attention_head_dim": 16,
            "num_attention_heads": 4,
            "joint_attention_dim": 16,
            "guidance_embeds": False,
            "axes_dims_rope": (8, 4, 4),
        }

class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = QwenImagePipeline
    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback_on_step_end",
            "callback_on_step_end_tensor_inputs",
        ]
    )
    supports_dduf = False
    test_xformers_attention = False
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = QwenImageTransformer2DModel(
            patch_size=2,
            in_channels=16,
            out_channels=4,
            num_layers=2,
            attention_head_dim=16,
            num_attention_heads=3,
            joint_attention_dim=16,
            guidance_embeds=False,
            axes_dims_rope=(8, 4, 4),
        )
        torch.manual_seed(0)
        z_dim = 4
        vae = AutoencoderKLQwenImage(
            base_dim=z_dim * 6,
            z_dim=z_dim,
            dim_mult=[1, 2, 4],
            num_res_blocks=1,
            temperal_downsample=[False, True],
            # fmt: off
            latents_mean=[0.0] * 4,
            latents_std=[1.0] * 4,
            # fmt: on
        )
        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler()
        torch.manual_seed(0)
        config = Qwen2_5_VLConfig(
            text_config={
                "hidden_size": 16,
                "intermediate_size": 16,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "rope_scaling": {
                    "mrope_section": [1, 1, 2],
                    "rope_type": "default",
                    "type": "default",
                },
                "rope_theta": 1000000.0,
            },
            vision_config={
                "depth": 2,
                "hidden_size": 16,
                "intermediate_size": 16,
                "num_heads": 2,
                "out_hidden_size": 16,
            },
            hidden_size=16,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(config).eval()
        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
        components = {
            "transformer": transformer,

class FluxPipelineSlowTests(unittest.TestCase):
    pipeline_class = FluxPipeline
    repo_id = "black-forest-labs/FLUX.1-schnell"

    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, seed=0):
        generator = torch.Generator(device="cpu").manual_seed(seed)
        prompt_embeds = torch.load(
            hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
        ).to(torch_device)
        pooled_prompt_embeds = torch.load(
            hf_hub_download(
                repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
            )
        ).to(torch_device)
        return {
            "prompt_embeds": prompt_embeds,
            "pooled_prompt_embeds": pooled_prompt_embeds,
            "num_inference_steps": 2,
            "guidance_scale": 0.0,
            "max_sequence_length": 256,
            "output_type": "np",
            "generator": generator,
        }

    def test_flux_inference(self):
        pipe = self.pipeline_class.from_pretrained(
            self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
        ).to(torch_device)
        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images[0]
        image_slice = image[0, :10, :10]
        # fmt: off
        expected_slices = Expectations(
            {
                ("cuda", None): np.array([0.3242, 0.3203, 0.3164, 0.3164, 0.3125, 0.3125, 0.3281, 0.3242, 0.3203, 0.3301, 0.3262, 0.3242, 0.3281, 0.3242, 0.3203, 0.3262, 0.3262, 0.3164, 0.3262, 0.3281, 0.3184, 0.3281, 0.3281, 0.3203, 0.3281, 0.3281, 0.3164, 0.3320, 0.3320, 0.3203], dtype=np.float32,),
                ("xpu", 3): np.array([0.3301, 0.3281, 0.3359, 0.3203, 0.3203, 0.3281, 0.3281, 0.3301, 0.3340, 0.3281, 0.3320, 0.3359, 0.3281, 0.3301, 0.3320, 0.3242, 0.3301, 0.3281, 0.3242, 0.3320, 0.3320, 0.3281, 0.3320, 0.3320, 0.3262, 0.3320, 0.3301, 0.3301, 0.3359, 0.3320], dtype=np.float32,),
            }
        )
        expected_slice = expected_slices.get_expectation()
        # fmt: on
        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
        self.assertLess(
            max_diff, 1e-4, f"Image slice is different from expected slice: {image_slice} != {expected_slice}"
        )


@slow
@require_big_accelerator
class FluxIPAdapterPipelineSlowTests(unittest.TestCase):

Suggested fix:
Add a model fast test using ModelTesterMixin and AttentionTesterMixin, add a pipeline fast test with tiny synthetic components, and add at least one @slow pipeline smoke test for the published Ovis checkpoint. These tests should cover top-level imports, save/load, attention processor APIs, batched prompts with CFG, prompt_embeds, callback properties, and max_sequence_length.
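
As a starting point, and not a substitute for the mixin-based suites described above, a minimal fast test could lock in the public surface flagged in Issues 1-6 without loading any weights; file and class names below are placeholders.

# tests/pipelines/ovis_image/test_pipeline_ovis_image.py (placeholder name)
import inspect
import unittest

from diffusers import OvisImagePipeline, OvisImageTransformer2DModel


class OvisImagePublicSurfaceFastTests(unittest.TestCase):
    def test_call_exposes_documented_arguments(self):
        params = inspect.signature(OvisImagePipeline.__call__).parameters
        for name in ("prompt", "negative_prompt", "num_images_per_prompt", "max_sequence_length"):
            self.assertIn(name, params)

    def test_transformer_exposes_attention_processor_api(self):
        # Passes once the transformer inherits AttentionMixin (Issue 1).
        for name in ("attn_processors", "set_attn_processor", "fuse_qkv_projections", "unfuse_qkv_projections"):
            self.assertTrue(hasattr(OvisImageTransformer2DModel, name))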
