Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/openvino/export.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,14 @@ When Stable Diffusion models are exported to the OpenVINO format, they are decom
* VAE encoder
* VAE decoder

For LTX-Video checkpoints with `timestep_conditioning` enabled (for example, LTX-Video 0.9.1), the export automatically adds a dynamic `timestep` input to the VAE decoder when the checkpoint configuration requires it.

To export a text-to-video checkpoint such as LTX-Video:

```bash
optimum-cli export openvino --model <ltx-video-model-id> --task text-to-video ov_ltx_video/
```

To export your Stable Diffusion XL model to the OpenVINO IR format with the CLI you can do as follows:

```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ Here is the list of the supported architectures :
- Flux
- Sana
- SanaSprint
- LTX
- LTX-Video

## [Timm](https://huggingface.co/docs/timm/index)
- PiT
Expand Down
21 changes: 17 additions & 4 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@
LlavaImageEmbeddingModelPatcher,
LlavaNextVideoImageEmbeddingModelPatcher,
LlavaQwen2ImageEmbeddingsModelPatcher,
LTXVaeDecoderModelPatcher,
MairaImageEmbeddingModelPatcher,
MambaPatcher,
MarianModelPatcher,
Expand Down Expand Up @@ -2806,14 +2807,25 @@ def __init__(
):
super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
self.num_frames = num_frames
self.sample_num_channels = getattr(normalized_config.config, "in_channels", self.num_channels)
self.latent_num_channels = getattr(normalized_config.config, "latent_channels", self.num_channels)

def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
if input_name in ["sample", "latent_sample"]:
if input_name == "sample":
return self.random_float_tensor(
[self.batch_size, self.num_channels, self.num_frames, self.height, self.width]
[self.batch_size, self.sample_num_channels, self.num_frames, self.height, self.width],
framework=framework,
dtype=float_dtype,
)
if input_name == "latent_sample":
return self.random_float_tensor(
[self.batch_size, self.latent_num_channels, self.num_frames, self.height, self.width],
framework=framework,
dtype=float_dtype,
)
if input_name == "timestep":
return self.random_int_tensor([1], max_value=20, min_value=1, framework=framework, dtype=int_dtype)
# Export timestep as float and keep batch-dynamic mapping in decoder config.
return self.random_float_tensor([self.batch_size], framework=framework, dtype=float_dtype)

return super().generate(input_name, framework, int_dtype, float_dtype)

Expand All @@ -2838,14 +2850,15 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
@register_in_tasks_manager("ltx-vae-decoder", *["semantic-segmentation"], library_name="diffusers")
class LTXVaeDecoderOpenVINOConfig(VaeDecoderOnnxConfig):
DUMMY_INPUT_GENERATOR_CLASSES = (LTXVaeDummyInputGenerator,)
_MODEL_PATCHER = LTXVaeDecoderModelPatcher

@property
def inputs(self) -> Dict[str, Dict[int, str]]:
base_input = {
"latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"},
}
if self._normalized_config.config.timestep_conditioning:
base_input["timestep"] = {}
base_input["timestep"] = {0: "batch_size"}
return base_input

@property
Expand Down
35 changes: 35 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3495,6 +3495,41 @@ def __exit__(self, exc_type, exc_value, traceback):
self._model.pos_embed.forward = self._model.pos_embed._orig_forward


def _ltx_vae_decoder_forward(model, latent_sample, timestep=None):
if timestep is not None:
if timestep.ndim == 0:
timestep = timestep.reshape(1)
elif timestep.ndim > 1:
timestep = timestep.reshape(-1)

batch_size = latent_sample.shape[0]
if timestep.shape[0] != batch_size:
if timestep.shape[0] == 1:
timestep = timestep.expand(batch_size)
else:
timestep = timestep[:1].expand(batch_size)

timestep = timestep.to(dtype=latent_sample.dtype)

return model.decode(z=latent_sample, temb=timestep)


class LTXVaeDecoderModelPatcher(ModelPatcher):
    # Patcher that redirects the exported forward of the LTX VAE decoder to
    # `_ltx_vae_decoder_forward`, so the traced signature becomes
    # (latent_sample, timestep=None) and the optional `timestep` conditioning
    # tensor is normalized before reaching `model.decode`.
    def __init__(
        self,
        config: "OnnxConfig",
        model: "PreTrainedModel",
        model_kwargs: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(config, model, model_kwargs)

        # `self.orig_forward` and `self._model` are set by the ModelPatcher
        # base class; `functools.wraps` keeps the original forward's metadata
        # on the replacement so downstream signature inspection still works.
        @functools.wraps(self.orig_forward)
        def patched_forward(latent_sample, timestep=None):
            return _ltx_vae_decoder_forward(self._model, latent_sample, timestep)

        self.patched_forward = patched_forward


def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask):
bs = image_feature.shape[0]
image_feature = self.kv_proj(image_feature) # B * L * D
Expand Down
45 changes: 45 additions & 0 deletions optimum/intel/openvino/modeling_diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1662,6 +1662,51 @@ class OVLTXPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LTXPipel
export_feature = "text-to-video"
auto_model_class = LTXPipeline

@staticmethod
def _expand_decode_condition(value, batch_size: int, num_videos_per_prompt: int):
effective_batch_size = batch_size * num_videos_per_prompt

if isinstance(value, tuple):
value = list(value)

if isinstance(value, list):
if len(value) == effective_batch_size:
return value
if len(value) == batch_size:
return [item for item in value for _ in range(num_videos_per_prompt)]
if len(value) == 1:
return value * effective_batch_size

return [value] * effective_batch_size

def __call__(self, *args, **kwargs):
    """Run the LTX text-to-video pipeline.

    Before delegating to the parent pipeline, normalizes ``decode_timestep``
    (defaulting to 0.0 when absent) and, when provided, ``decode_noise_scale``
    so that each generated video receives one conditioning entry.
    """
    prompt = kwargs.get("prompt", args[0] if args else None)
    num_videos_per_prompt = kwargs.get("num_videos_per_prompt", 1) or 1

    # Infer the prompt batch size: string prompt -> 1, list of prompts -> its
    # length, otherwise fall back to precomputed embeddings (or 1).
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        prompt_embeds = kwargs.get("prompt_embeds")
        batch_size = prompt_embeds.shape[0] if prompt_embeds is not None else 1

    decode_timestep = kwargs.get("decode_timestep")
    if decode_timestep is None:
        decode_timestep = 0.0
    kwargs["decode_timestep"] = self._expand_decode_condition(
        decode_timestep, batch_size, num_videos_per_prompt
    )

    decode_noise_scale = kwargs.get("decode_noise_scale")
    if decode_noise_scale is not None:
        kwargs["decode_noise_scale"] = self._expand_decode_condition(
            decode_noise_scale, batch_size, num_videos_per_prompt
        )

    return super().__call__(*args, **kwargs)


SUPPORTED_OV_PIPELINES = [
OVStableDiffusionPipeline,
Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/test_diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ def test_textual_inversion(self):
class OVPipelineForText2VideoTest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = []
if is_diffusers_version(">=", "0.28.2"):
SUPPORTED_ARCHITECTURES.extend(["ltx-video"])
SUPPORTED_ARCHITECTURES.extend(["ltx-video", "ltx-video-0.9.1"])

OVMODEL_CLASS = OVPipelineForText2Video
AUTOMODEL_CLASS = DiffusionPipeline
Expand Down
2 changes: 2 additions & 0 deletions tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class ExportModelTest(unittest.TestCase):
"stable-diffusion-3": OVStableDiffusion3Pipeline,
"flux": OVFluxPipeline,
"ltx-video": OVLTXPipeline,
"ltx-video-0.9.1": OVLTXPipeline,
}

if is_transformers_version(">=", "4.48.0"):
Expand Down Expand Up @@ -133,6 +134,7 @@ class ExportModelTest(unittest.TestCase):
"flux": {"text_encoder_2": "8.0", "transformer": "8.0", "vae_encoder": "8.0", "vae_decoder": "8.0"},
"stable-diffusion-xl-refiner": {"vae_encoder": "128.0", "vae_decoder": "128.0"},
"ltx-video": {"text_encoder": "8.0", "vae_encoder": "8.0", "vae_decoder": "8.0"},
"ltx-video-0.9.1": {"text_encoder": "8.0", "vae_encoder": "8.0", "vae_decoder": "8.0"},
}

if is_transformers_version(">=", "4.51"):
Expand Down
2 changes: 2 additions & 0 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class OVCLIExportTestCase(unittest.TestCase):
("inpainting", "flux-fill"),
("text-to-image", "sana"),
("text-to-video", "ltx-video"),
("text-to-video", "ltx-video-0.9.1"),
("feature-extraction", "sam"),
("text-to-audio", "speecht5"),
("zero-shot-image-classification", "clip"),
Expand Down Expand Up @@ -209,6 +210,7 @@ class OVCLIExportTestCase(unittest.TestCase):
"llava": 2,
"sana": 2,
"ltx-video": 2,
"ltx-video-0.9.1": 2,
"sam": 0, # no tokenizer
"speecht5": 2,
"clip": 2,
Expand Down
1 change: 1 addition & 0 deletions tests/openvino/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@
"sana": "optimum-intel-internal-testing/tiny-random-sana",
"sana-sprint": "optimum-intel-internal-testing/tiny-random-sana-sprint",
"ltx-video": "optimum-intel-internal-testing/tiny-random-ltx-video",
"ltx-video-0.9.1": "creeper-hat/tiny-random-ltx-video-0.9.1",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anatyrova, please help upload this tiny model to the optimum-intel-internal-testing organization

"zamba2": "optimum-intel-internal-testing/tiny-random-zamba2",
"qwen3_eagle3": "AngelSlim/Qwen3-1.7B_eagle3",
}
Expand Down