Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/inference/support_matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pipeline initialization and sampling.

| Model Name | HuggingFace Model ID | Resolutions | TeaCache | Sliding Tile Attn | Sage Attn | VSA | BSA |
|------------|---------------------|-------------|----------|-------------------|-----------|-----|-----|
| Ovis-Image 7B | `AIDC-AI/Ovis-Image-7B` | 1024×1024 | ⭕ | ⭕ | ⭕ | ⭕ | ⭕ |
| FastWan2.1 T2V 1.3B | `FastVideo/FastWan2.1-T2V-1.3B-Diffusers` | 480P | ⭕ | ⭕ | ⭕ | ✅ | ⭕ |
| FastWan2.2 TI2V 5B Full Attn* | `FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers` | 720P | ⭕ | ⭕ | ⭕ | ✅ | ⭕ |
| Wan2.2 TI2V 5B | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | 720P | ⭕ | ⭕ | ✅ | ⭕ | ⭕ |
Expand Down
104 changes: 104 additions & 0 deletions examples/inference/basic/basic_ovis_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""
Ovis-Image Text-to-Image Generation Example

This example demonstrates how to use the Ovis-Image-7B model for high-quality
text-to-image generation, especially for text rendering in images.

Ovis-Image excels at:
- Text rendering in posters, banners, logos
- UI mockups with readable text
- Infographics with correct spelling
- Bilingual text rendering
"""

from fastvideo import VideoGenerator

OUTPUT_PATH = "ovis_image_samples"


def main() -> None:
    """Generate three sample images (poster, UI mockup, logo) with Ovis-Image.

    Each example exercises the model's text-rendering strength at a
    different resolution / aspect ratio. Images are written to
    ``OUTPUT_PATH`` by ``generate_video`` (``num_frames=1`` produces a
    still image).
    """
    # Load the Ovis-Image model; FastVideo handles distributed setup.
    generator = VideoGenerator.from_pretrained(
        "AIDC-AI/Ovis-Image-7B",
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=False,  # Qwen3 text encoder
        pin_cpu_memory=True,
    )

    # (label, prompt, height, width) for each example.
    examples = [
        (
            "Text rendering poster",
            'A creative 3D artistic render where the text "OVIS-IMAGE" is written '
            'in a bold, expressive handwritten brush style using thick, wet oil paint. '
            'The paint is a mix of vibrant rainbow colors (red, blue, yellow) swirling '
            'together like toothpaste or impasto art. You can see the ridges of the brush '
            'bristles and the glossy, wet texture of the paint. The background is a clean '
            "artist's canvas. Dynamic lighting creates soft shadows behind the floating "
            'paint strokes. Colorful, expressive, tactile texture, 4k detail.',
            1024,
            1024,
        ),
        (
            "UI mockup",
            'A modern mobile app interface mockup showing a weather app. '
            'At the top, display "Weather Today" in clean sans-serif font. '
            'Below show the temperature "72°F" in large numbers. '
            'Include labeled sections: "Humidity: 65%", "Wind: 12 mph", '
            'and "Forecast: Sunny". Use a gradient blue background with '
            'white text. Minimalist design, professional UI/UX, high resolution.',
            1024,
            768,  # Portrait orientation for mobile
        ),
        (
            "Logo design",
            'A professional tech startup logo featuring the text "FAST AI" '
            'in bold, modern geometric font. The letters are metallic silver '
            'with a subtle blue glow effect. Below in smaller text: '
            '"Innovation through Technology". Clean white background, '
            'minimalist design, corporate branding style, vector-like quality.',
            512,
            512,  # Square for logo
        ),
    ]

    for index, (label, prompt, height, width) in enumerate(examples, start=1):
        print(f"Generating image {index}: {label}...")
        generator.generate_video(
            prompt,
            output_path=OUTPUT_PATH,
            save_video=True,
            num_frames=1,  # Single image for T2I
            height=height,
            width=width,
            num_inference_steps=50,
            guidance_scale=5.0,
        )

    print(f"\nAll images saved to {OUTPUT_PATH}/")
    print("Ovis-Image generation complete!")


if __name__ == "__main__":
    main()
7 changes: 4 additions & 3 deletions fastvideo/configs/models/dits/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
from fastvideo.configs.models.dits.stepvideo import StepVideoConfig
from fastvideo.configs.models.dits.wanvideo import WanVideoConfig
from fastvideo.configs.models.dits.hyworld import HYWorldConfig
from fastvideo.configs.models.dits.ovisimage import OvisImageTransformer2DModelConfig

__all__ = [
"HunyuanVideoConfig", "HunyuanVideo15Config", "HunyuanGameCraftConfig",
"WanVideoConfig", "StepVideoConfig", "CosmosVideoConfig",
"Cosmos25VideoConfig", "LongCatVideoConfig", "LTX2VideoConfig",
"HYWorldConfig"
"WanVideoConfig", "StepVideoConfig", "CosmosVideoConfig",
"Cosmos25VideoConfig", "LongCatVideoConfig", "LTX2VideoConfig",
"HYWorldConfig", "OvisImageTransformer2DModelConfig"
]
63 changes: 63 additions & 0 deletions fastvideo/configs/models/dits/ovisimage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
"""Configuration for OvisImageTransformer2DModel"""

from dataclasses import dataclass, field

from fastvideo.configs.models.dits.base import DiTArchConfig, DiTConfig


def _is_double_block(n: str, m) -> bool:
"""Match transformer_blocks.{i} (double-stream blocks)."""
return "transformer_blocks" in n and "single" not in n and str.isdigit(
n.split(".")[-1])


def _is_single_block(n: str, m) -> bool:
"""Match single_transformer_blocks.{i} (single-stream blocks)."""
return "single_transformer_blocks" in n and str.isdigit(n.split(".")[-1])


@dataclass
class OvisImageTransformer2DModelArchConfig(DiTArchConfig):
    """Architecture configuration for OvisImageTransformer2DModel.

    Defaults mirror the Ovis-Image-7B checkpoint; the DiT uses a mix of
    double-stream (``transformer_blocks``) and single-stream
    (``single_transformer_blocks``) layers.
    """

    # Core architecture
    hidden_size: int = 3072  # num_attention_heads * attention_head_dim = 24 * 128
    num_attention_heads: int = 24
    attention_head_dim: int = 128
    num_layers: int = 6  # Number of joint (double-stream) layers
    num_single_layers: int = 27  # Number of single-stream layers

    # Input/output configuration
    in_channels: int = 64  # Packed latent channels (see num_channels_latents)
    out_channels: int | None = None  # Can be None, defaults to in_channels
    patch_size: int = 1

    # Dimensions
    joint_attention_dim: int = 2048  # Context dimension from text encoder
    axes_dims_rope: list[int] = field(default_factory=lambda: [16, 56, 56])

    # Legacy fields from base DiTArchConfig
    num_channels_latents: int = 16  # VAE latent channels (in_channels=64 is packed=16*4)

    # FSDP: shard both double- and single-stream transformer blocks.
    _fsdp_shard_conditions: list = field(
        default_factory=lambda: [_is_double_block, _is_single_block])

    # Compile: same block granularity as FSDP for now.
    _compile_conditions: list = field(
        default_factory=lambda: [_is_double_block, _is_single_block])

    # Weight name mapping: identity (native attrs match HF attrs), so all
    # mapping dicts stay empty.
    param_names_mapping: dict = field(default_factory=dict)
    reverse_param_names_mapping: dict = field(default_factory=dict)
    lora_param_names_mapping: dict = field(default_factory=dict)


@dataclass
class OvisImageTransformer2DModelConfig(DiTConfig):
    """Configuration for the Ovis-Image DiT.

    Wraps :class:`OvisImageTransformer2DModelArchConfig` and sets the
    weight-prefix used when loading/matching checkpoint parameters.
    """

    arch_config: DiTArchConfig = field(
        default_factory=OvisImageTransformer2DModelArchConfig)
    prefix: str = "OvisImage"
5 changes: 3 additions & 2 deletions fastvideo/configs/models/encoders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from fastvideo.configs.models.encoders.llama import LlamaConfig
from fastvideo.configs.models.encoders.t5 import T5Config, T5LargeConfig
from fastvideo.configs.models.encoders.qwen2_5 import Qwen2_5_VLConfig
from fastvideo.configs.models.encoders.qwen3 import Qwen3ArchConfig, Qwen3Config
from fastvideo.configs.models.encoders.siglip import SiglipVisionConfig
from fastvideo.configs.models.encoders.reason1 import Reason1ArchConfig, Reason1Config
from fastvideo.configs.models.encoders.gemma import LTX2GemmaConfig
Expand All @@ -15,6 +16,6 @@
"EncoderConfig", "TextEncoderConfig", "ImageEncoderConfig",
"BaseEncoderOutput", "CLIPTextConfig", "CLIPVisionConfig",
"WAN2_1ControlCLIPVisionConfig", "LlamaConfig", "T5Config", "T5LargeConfig",
"Qwen2_5_VLConfig", "Reason1ArchConfig", "Reason1Config", "LTX2GemmaConfig",
"SiglipVisionConfig"
"Qwen2_5_VLConfig", "Qwen3ArchConfig", "Qwen3Config", "Reason1ArchConfig",
"Reason1Config", "LTX2GemmaConfig", "SiglipVisionConfig"
]
99 changes: 99 additions & 0 deletions fastvideo/configs/models/encoders/qwen3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass, field

from fastvideo.configs.models.encoders.base import (TextEncoderArchConfig,
TextEncoderConfig)


def _is_transformer_layer(n: str, m) -> bool:
return "layers" in n and str.isdigit(n.split(".")[-1])


def _is_embeddings(n: str, m) -> bool:
return n.endswith("embed_tokens")


def _is_final_norm(n: str, m) -> bool:
return n.endswith("norm")


@dataclass
class Qwen3ArchConfig(TextEncoderArchConfig):
    """Architecture config for Qwen3 text encoder (used in Ovis-Image).

    Field defaults are taken from the Ovis2.5-2B (Qwen3-2B) checkpoint's
    HuggingFace config.
    """

    # Model architecture - defaults from Ovis2.5-2B (Qwen3-2B)
    vocab_size: int = 151936
    hidden_size: int = 2048
    intermediate_size: int = 6144  # Actual value from Ovis2.5-2B
    num_hidden_layers: int = 28  # Actual value from Ovis2.5-2B
    num_attention_heads: int = 16
    num_key_value_heads: int = 8  # Actual value from Ovis2.5-2B (GQA)
    hidden_act: str = "silu"
    max_position_embeddings: int = 40960  # Actual value from Ovis2.5-2B
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-06
    use_cache: bool = True
    tie_word_embeddings: bool = True
    rope_theta: float = 1000000.0
    rope_scaling: dict | None = None
    use_sliding_window: bool = False
    sliding_window: int | None = None  # Can be None
    max_window_layers: int = 28  # Actual value from Ovis2.5-2B
    attention_dropout: float = 0.0
    attention_bias: bool = False
    head_dim: int = 128

    # HuggingFace transformers fields
    bos_token_id: int = 151643
    eos_token_id: int = 151645
    dtype: str = "float32"
    _attn_implementation_autoset: bool = True
    layer_types: list[str] = field(
        default_factory=lambda: ["full_attention"] * 28)

    # FastVideo-specific settings
    hidden_state_skip_layer: int = 0
    text_len: int = 256  # Max user-prompt token length

    # Ovis-Image uses system prompt tokens (28 tokens) prepended to user tokens
    user_prompt_begin_id: int = 28

    # Qwen3-specific stacked params: shard ids are "q"/"k"/"v" for the fused
    # QKV projection and 0/1 for the fused gate/up projection.
    stacked_params_mapping: list[tuple[str, str, str | int]] = field(
        default_factory=lambda: [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ])

    _fsdp_shard_conditions: list = field(
        default_factory=lambda:
        [_is_transformer_layer, _is_embeddings, _is_final_norm])

    def __post_init__(self):
        super().__post_init__()
        # Override tokenizer_kwargs for apply_chat_template.
        # Ovis-Image uses a chat template with a system prompt (28 tokens
        # prepended), so total max_length = text_len + user_prompt_begin_id.
        self.tokenizer_kwargs = {
            "add_generation_prompt": True,
            "tokenize": True,
            "return_dict": True,
            "padding": "max_length",
            "max_length": self.text_len + self.user_prompt_begin_id,
            "truncation": True,
            "return_tensors": "pt",
            "enable_thinking": False,
        }


@dataclass
class Qwen3Config(TextEncoderConfig):
    """Configuration for the Qwen3 text encoder.

    ``is_chat_model=True`` routes prompts through ``apply_chat_template``
    (see ``Qwen3ArchConfig.tokenizer_kwargs``).
    """

    arch_config: TextEncoderArchConfig = field(default_factory=Qwen3ArchConfig)
    prefix: str = "qwen3"
    is_chat_model: bool = True
20 changes: 20 additions & 0 deletions fastvideo/configs/models/vaes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,26 @@ class VAEArchConfig(ArchConfig):
temporal_compression_ratio: int = 4
spatial_compression_ratio: int = 8

# Additional fields from diffusers AutoencoderKL
act_fn: str = "silu"
block_out_channels: list[int] = field(
default_factory=lambda: [128, 256, 512, 512])
down_block_types: list[str] = field(default_factory=list)
up_block_types: list[str] = field(default_factory=list)
force_upcast: bool = False
in_channels: int = 3
latent_channels: int = 16
latents_mean: list[float] | None = None
latents_std: list[float] | None = None
layers_per_block: int = 2
mid_block_add_attention: bool = True
norm_num_groups: int = 32
out_channels: int = 3
sample_size: int = 1024
shift_factor: float | None = None
use_post_quant_conv: bool = False
use_quant_conv: bool = False


@dataclass
class VAEConfig(ModelConfig):
Expand Down
37 changes: 37 additions & 0 deletions fastvideo/configs/ovis_image_7b_t2i_pipeline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"embedded_cfg_scale": 5.0,
"flow_shift": 3.0,
"dit_cpu_offload": false,
"disable_autocast": false,
"precision": "bf16",
"vae_precision": "fp32",
"vae_tiling": true,
"vae_sp": false,
"vae_config": {
"load_encoder": false,
"load_decoder": true,
"tile_sample_min_height": 256,
"tile_sample_min_width": 256,
"tile_sample_stride_height": 192,
"tile_sample_stride_width": 192,
"use_tiling": true,
"use_temporal_tiling": false,
"use_parallel_tiling": false,
"use_feature_cache": true
},
"dit_config": {
"prefix": "OvisImage",
"quant_config": null
},
"text_encoder_precisions": [
"bf16"
],
"text_encoder_configs": [
{
"prefix": "qwen3",
"quant_config": null,
"lora_config": null
}
],
"enable_torch_compile": false
}
Loading