Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/openvino/export.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,14 @@ When Stable Diffusion models are exported to the OpenVINO format, they are decom
* VAE encoder
* VAE decoder

For LTX-Video checkpoints with `timestep_conditioning` enabled (for example, LTX-Video 0.9.1), the export automatically adds a dynamic `timestep` input to the VAE decoder when the checkpoint configuration requires it.

To export a text-to-video checkpoint such as LTX-Video:

```bash
optimum-cli export openvino --model <ltx-video-model-id> --task text-to-video ov_ltx_video/
```

To export your Stable Diffusion XL model to the OpenVINO IR format with the CLI you can do as follows:

```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ Here is the list of the supported architectures :
- Flux
- Sana
- SanaSprint
- LTX
- LTX-Video

## [Timm](https://huggingface.co/docs/timm/index)
- PiT
Expand Down
21 changes: 17 additions & 4 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@
LlavaImageEmbeddingModelPatcher,
LlavaNextVideoImageEmbeddingModelPatcher,
LlavaQwen2ImageEmbeddingsModelPatcher,
LTXVaeDecoderModelPatcher,
MairaImageEmbeddingModelPatcher,
MambaPatcher,
MarianModelPatcher,
Expand Down Expand Up @@ -2806,14 +2807,25 @@ def __init__(
):
super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
self.num_frames = num_frames
self.sample_num_channels = getattr(normalized_config.config, "in_channels", self.num_channels)
self.latent_num_channels = getattr(normalized_config.config, "latent_channels", self.num_channels)

def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
if input_name in ["sample", "latent_sample"]:
if input_name == "sample":
return self.random_float_tensor(
[self.batch_size, self.num_channels, self.num_frames, self.height, self.width]
[self.batch_size, self.sample_num_channels, self.num_frames, self.height, self.width],
framework=framework,
dtype=float_dtype,
)
if input_name == "latent_sample":
return self.random_float_tensor(
[self.batch_size, self.latent_num_channels, self.num_frames, self.height, self.width],
framework=framework,
dtype=float_dtype,
)
if input_name == "timestep":
return self.random_int_tensor([1], max_value=20, min_value=1, framework=framework, dtype=int_dtype)
# Export timestep as float and keep batch-dynamic mapping in decoder config.
return self.random_float_tensor([self.batch_size], framework=framework, dtype=float_dtype)

return super().generate(input_name, framework, int_dtype, float_dtype)

Expand All @@ -2838,14 +2850,15 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
@register_in_tasks_manager("ltx-vae-decoder", *["semantic-segmentation"], library_name="diffusers")
class LTXVaeDecoderOpenVINOConfig(VaeDecoderOnnxConfig):
DUMMY_INPUT_GENERATOR_CLASSES = (LTXVaeDummyInputGenerator,)
_MODEL_PATCHER = LTXVaeDecoderModelPatcher

@property
def inputs(self) -> Dict[str, Dict[int, str]]:
base_input = {
"latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"},
}
if self._normalized_config.config.timestep_conditioning:
base_input["timestep"] = {}
base_input["timestep"] = {0: "batch_size"}
return base_input

@property
Expand Down
35 changes: 35 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3495,6 +3495,41 @@ def __exit__(self, exc_type, exc_value, traceback):
self._model.pos_embed.forward = self._model.pos_embed._orig_forward


def _ltx_vae_decoder_forward(model, latent_sample, timestep=None):
if timestep is not None:
if timestep.ndim == 0:
timestep = timestep.reshape(1)
elif timestep.ndim > 1:
timestep = timestep.reshape(-1)

batch_size = latent_sample.shape[0]
if timestep.shape[0] != batch_size:
if timestep.shape[0] == 1:
timestep = timestep.expand(batch_size)
else:
timestep = timestep[:1].expand(batch_size)

timestep = timestep.to(dtype=latent_sample.dtype)

return model.decode(z=latent_sample, temb=timestep)


class LTXVaeDecoderModelPatcher(ModelPatcher):
    # Patcher that redirects the exported forward of the LTX VAE decoder to
    # `_ltx_vae_decoder_forward`, so the traced signature becomes
    # (latent_sample, timestep=None) and the optional `timestep` conditioning
    # tensor is normalized before reaching `model.decode`.
    def __init__(
        self,
        config: "OnnxConfig",
        model: "PreTrainedModel",
        model_kwargs: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(config, model, model_kwargs)

        # `self.orig_forward` and `self._model` are set by the ModelPatcher
        # base class; `functools.wraps` keeps the original forward's metadata
        # on the replacement so downstream signature inspection still works.
        @functools.wraps(self.orig_forward)
        def patched_forward(latent_sample, timestep=None):
            return _ltx_vae_decoder_forward(self._model, latent_sample, timestep)

        self.patched_forward = patched_forward


def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask):
bs = image_feature.shape[0]
image_feature = self.kv_proj(image_feature) # B * L * D
Expand Down
45 changes: 45 additions & 0 deletions optimum/intel/openvino/modeling_diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1662,6 +1662,51 @@ class OVLTXPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LTXPipel
export_feature = "text-to-video"
auto_model_class = LTXPipeline

@staticmethod
def _expand_decode_condition(value, batch_size: int, num_videos_per_prompt: int):
effective_batch_size = batch_size * num_videos_per_prompt

if isinstance(value, tuple):
value = list(value)

if isinstance(value, list):
if len(value) == effective_batch_size:
return value
if len(value) == batch_size:
return [item for item in value for _ in range(num_videos_per_prompt)]
if len(value) == 1:
return value * effective_batch_size

return [value] * effective_batch_size

def __call__(self, *args, **kwargs):
    """Run the LTX text-to-video pipeline.

    Before delegating to the parent pipeline, normalizes ``decode_timestep``
    (defaulting to 0.0 when absent) and, when provided, ``decode_noise_scale``
    so that each generated video receives one conditioning entry.
    """
    prompt = kwargs.get("prompt", args[0] if args else None)
    num_videos_per_prompt = kwargs.get("num_videos_per_prompt", 1) or 1

    # Infer the prompt batch size: string prompt -> 1, list of prompts -> its
    # length, otherwise fall back to precomputed embeddings (or 1).
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        prompt_embeds = kwargs.get("prompt_embeds")
        batch_size = prompt_embeds.shape[0] if prompt_embeds is not None else 1

    decode_timestep = kwargs.get("decode_timestep")
    if decode_timestep is None:
        decode_timestep = 0.0
    kwargs["decode_timestep"] = self._expand_decode_condition(
        decode_timestep, batch_size, num_videos_per_prompt
    )

    decode_noise_scale = kwargs.get("decode_noise_scale")
    if decode_noise_scale is not None:
        kwargs["decode_noise_scale"] = self._expand_decode_condition(
            decode_noise_scale, batch_size, num_videos_per_prompt
        )

    return super().__call__(*args, **kwargs)


SUPPORTED_OV_PIPELINES = [
OVStableDiffusionPipeline,
Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/test_diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ def test_textual_inversion(self):
class OVPipelineForText2VideoTest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = []
if is_diffusers_version(">=", "0.28.2"):
SUPPORTED_ARCHITECTURES.extend(["ltx-video"])
SUPPORTED_ARCHITECTURES.extend(["ltx-video", "ltx-video-0.9.1"])

OVMODEL_CLASS = OVPipelineForText2Video
AUTOMODEL_CLASS = DiffusionPipeline
Expand Down
2 changes: 2 additions & 0 deletions tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class ExportModelTest(unittest.TestCase):
"stable-diffusion-3": OVStableDiffusion3Pipeline,
"flux": OVFluxPipeline,
"ltx-video": OVLTXPipeline,
"ltx-video-0.9.1": OVLTXPipeline,
}

if is_transformers_version(">=", "4.48.0"):
Expand Down Expand Up @@ -133,6 +134,7 @@ class ExportModelTest(unittest.TestCase):
"flux": {"text_encoder_2": "8.0", "transformer": "8.0", "vae_encoder": "8.0", "vae_decoder": "8.0"},
"stable-diffusion-xl-refiner": {"vae_encoder": "128.0", "vae_decoder": "128.0"},
"ltx-video": {"text_encoder": "8.0", "vae_encoder": "8.0", "vae_decoder": "8.0"},
"ltx-video-0.9.1": {"text_encoder": "8.0", "vae_encoder": "8.0", "vae_decoder": "8.0"},
}

if is_transformers_version(">=", "4.51"):
Expand Down
2 changes: 2 additions & 0 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class OVCLIExportTestCase(unittest.TestCase):
("inpainting", "flux-fill"),
("text-to-image", "sana"),
("text-to-video", "ltx-video"),
("text-to-video", "ltx-video-0.9.1"),
("feature-extraction", "sam"),
("text-to-audio", "speecht5"),
("zero-shot-image-classification", "clip"),
Expand Down Expand Up @@ -209,6 +210,7 @@ class OVCLIExportTestCase(unittest.TestCase):
"llava": 2,
"sana": 2,
"ltx-video": 2,
"ltx-video-0.9.1": 2,
"sam": 0, # no tokenizer
"speecht5": 2,
"clip": 2,
Expand Down
1 change: 1 addition & 0 deletions tests/openvino/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@
"sana": "optimum-intel-internal-testing/tiny-random-sana",
"sana-sprint": "optimum-intel-internal-testing/tiny-random-sana-sprint",
"ltx-video": "optimum-intel-internal-testing/tiny-random-ltx-video",
"ltx-video-0.9.1": "creeper-hat/tiny-random-ltx-video-0.9.1",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anatyrova, please help upload this tiny model to the optimum-intel-internal-testing organization

"zamba2": "optimum-intel-internal-testing/tiny-random-zamba2",
"qwen3_eagle3": "AngelSlim/Qwen3-1.7B_eagle3",
}
Expand Down