Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/inference/support_matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pipeline initialization and sampling.

| Model Name | HuggingFace Model ID | Resolutions | TeaCache | Sliding Tile Attn | Sage Attn | VSA | BSA |
|------------|---------------------|-------------|----------|-------------------|-----------|-----|-----|
| Ovis-Image 7B | `AIDC-AI/Ovis-Image-7B` | 1024×1024 | ⭕ | ⭕ | ⭕ | ⭕ | ⭕ |
| FastWan2.1 T2V 1.3B | `FastVideo/FastWan2.1-T2V-1.3B-Diffusers` | 480P | ⭕ | ⭕ | ⭕ | ✅ | ⭕ |
| FastWan2.2 TI2V 5B Full Attn* | `FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers` | 720P | ⭕ | ⭕ | ⭕ | ✅ | ⭕ |
| Wan2.2 TI2V 5B | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | 720P | ⭕ | ⭕ | ✅ | ⭕ | ⭕ |
Expand Down
104 changes: 104 additions & 0 deletions examples/inference/basic/basic_ovis_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""
Ovis-Image Text-to-Image Generation Example

This example demonstrates how to use the Ovis-Image-7B model for high-quality
text-to-image generation, especially for text rendering in images.

Ovis-Image excels at:
- Text rendering in posters, banners, logos
- UI mockups with readable text
- Infographics with correct spelling
- Bilingual text rendering
"""

from fastvideo import VideoGenerator

OUTPUT_PATH = "ovis_image_samples"


def main() -> None:
    """Generate three sample images (poster, UI mockup, logo) with Ovis-Image.

    Each example exercises the model's text-rendering strength at a
    different resolution / aspect ratio. Images are written to
    ``OUTPUT_PATH`` by ``generate_video`` (``num_frames=1`` produces a
    still image).
    """
    # Load the Ovis-Image model; FastVideo handles distributed setup.
    generator = VideoGenerator.from_pretrained(
        "AIDC-AI/Ovis-Image-7B",
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=False,  # Qwen3 text encoder
        pin_cpu_memory=True,
    )

    # (label, prompt, height, width) for each example.
    examples = [
        (
            "Text rendering poster",
            'A creative 3D artistic render where the text "OVIS-IMAGE" is written '
            'in a bold, expressive handwritten brush style using thick, wet oil paint. '
            'The paint is a mix of vibrant rainbow colors (red, blue, yellow) swirling '
            'together like toothpaste or impasto art. You can see the ridges of the brush '
            'bristles and the glossy, wet texture of the paint. The background is a clean '
            "artist's canvas. Dynamic lighting creates soft shadows behind the floating "
            'paint strokes. Colorful, expressive, tactile texture, 4k detail.',
            1024,
            1024,
        ),
        (
            "UI mockup",
            'A modern mobile app interface mockup showing a weather app. '
            'At the top, display "Weather Today" in clean sans-serif font. '
            'Below show the temperature "72°F" in large numbers. '
            'Include labeled sections: "Humidity: 65%", "Wind: 12 mph", '
            'and "Forecast: Sunny". Use a gradient blue background with '
            'white text. Minimalist design, professional UI/UX, high resolution.',
            1024,
            768,  # Portrait orientation for mobile
        ),
        (
            "Logo design",
            'A professional tech startup logo featuring the text "FAST AI" '
            'in bold, modern geometric font. The letters are metallic silver '
            'with a subtle blue glow effect. Below in smaller text: '
            '"Innovation through Technology". Clean white background, '
            'minimalist design, corporate branding style, vector-like quality.',
            512,
            512,  # Square for logo
        ),
    ]

    for index, (label, prompt, height, width) in enumerate(examples, start=1):
        print(f"Generating image {index}: {label}...")
        generator.generate_video(
            prompt,
            output_path=OUTPUT_PATH,
            save_video=True,
            num_frames=1,  # Single image for T2I
            height=height,
            width=width,
            num_inference_steps=50,
            guidance_scale=5.0,
        )

    print(f"\nAll images saved to {OUTPUT_PATH}/")
    print("Ovis-Image generation complete!")


if __name__ == "__main__":
    main()
7 changes: 4 additions & 3 deletions fastvideo/configs/models/dits/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
from fastvideo.configs.models.dits.stepvideo import StepVideoConfig
from fastvideo.configs.models.dits.wanvideo import WanVideoConfig
from fastvideo.configs.models.dits.hyworld import HYWorldConfig
from fastvideo.configs.models.dits.ovisimage import OvisImageTransformer2DModelConfig

__all__ = [
"HunyuanVideoConfig", "HunyuanVideo15Config", "HunyuanGameCraftConfig",
"WanVideoConfig", "StepVideoConfig", "CosmosVideoConfig",
"Cosmos25VideoConfig", "LongCatVideoConfig", "LTX2VideoConfig",
"HYWorldConfig"
"WanVideoConfig", "StepVideoConfig", "CosmosVideoConfig",
"Cosmos25VideoConfig", "LongCatVideoConfig", "LTX2VideoConfig",
"HYWorldConfig", "OvisImageTransformer2DModelConfig"
]
63 changes: 63 additions & 0 deletions fastvideo/configs/models/dits/ovisimage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
"""Configuration for OvisImageTransformer2DModel"""

from dataclasses import dataclass, field

from fastvideo.configs.models.dits.base import DiTArchConfig, DiTConfig


def _is_double_block(n: str, m) -> bool:
"""Match transformer_blocks.{i} (double-stream blocks)."""
return "transformer_blocks" in n and "single" not in n and str.isdigit(
n.split(".")[-1])


def _is_single_block(n: str, m) -> bool:
"""Match single_transformer_blocks.{i} (single-stream blocks)."""
return "single_transformer_blocks" in n and str.isdigit(n.split(".")[-1])


@dataclass
class OvisImageTransformer2DModelArchConfig(DiTArchConfig):
    """Architecture configuration for OvisImageTransformer2DModel.

    Defaults mirror the Ovis-Image-7B checkpoint; the DiT uses a mix of
    double-stream (``transformer_blocks``) and single-stream
    (``single_transformer_blocks``) layers.
    """

    # Core architecture
    hidden_size: int = 3072  # num_attention_heads * attention_head_dim = 24 * 128
    num_attention_heads: int = 24
    attention_head_dim: int = 128
    num_layers: int = 6  # Number of joint (double-stream) layers
    num_single_layers: int = 27  # Number of single-stream layers

    # Input/output configuration
    in_channels: int = 64  # Packed latent channels (see num_channels_latents)
    out_channels: int | None = None  # Can be None, defaults to in_channels
    patch_size: int = 1

    # Dimensions
    joint_attention_dim: int = 2048  # Context dimension from text encoder
    axes_dims_rope: list[int] = field(default_factory=lambda: [16, 56, 56])

    # Legacy fields from base DiTArchConfig
    num_channels_latents: int = 16  # VAE latent channels (in_channels=64 is packed=16*4)

    # FSDP: shard both double- and single-stream transformer blocks.
    _fsdp_shard_conditions: list = field(
        default_factory=lambda: [_is_double_block, _is_single_block])

    # Compile: same block granularity as FSDP for now.
    _compile_conditions: list = field(
        default_factory=lambda: [_is_double_block, _is_single_block])

    # Weight name mapping: identity (native attrs match HF attrs), so all
    # mapping dicts stay empty.
    param_names_mapping: dict = field(default_factory=dict)
    reverse_param_names_mapping: dict = field(default_factory=dict)
    lora_param_names_mapping: dict = field(default_factory=dict)


@dataclass
class OvisImageTransformer2DModelConfig(DiTConfig):
    """Configuration for the Ovis-Image DiT.

    Wraps :class:`OvisImageTransformer2DModelArchConfig` and sets the
    weight-prefix used when loading/matching checkpoint parameters.
    """

    arch_config: DiTArchConfig = field(
        default_factory=OvisImageTransformer2DModelArchConfig)
    prefix: str = "OvisImage"
5 changes: 3 additions & 2 deletions fastvideo/configs/models/encoders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from fastvideo.configs.models.encoders.llama import LlamaConfig
from fastvideo.configs.models.encoders.t5 import T5Config, T5LargeConfig
from fastvideo.configs.models.encoders.qwen2_5 import Qwen2_5_VLConfig
from fastvideo.configs.models.encoders.qwen3 import Qwen3ArchConfig, Qwen3Config
from fastvideo.configs.models.encoders.siglip import SiglipVisionConfig
from fastvideo.configs.models.encoders.reason1 import Reason1ArchConfig, Reason1Config
from fastvideo.configs.models.encoders.gemma import LTX2GemmaConfig
Expand All @@ -15,6 +16,6 @@
"EncoderConfig", "TextEncoderConfig", "ImageEncoderConfig",
"BaseEncoderOutput", "CLIPTextConfig", "CLIPVisionConfig",
"WAN2_1ControlCLIPVisionConfig", "LlamaConfig", "T5Config", "T5LargeConfig",
"Qwen2_5_VLConfig", "Reason1ArchConfig", "Reason1Config", "LTX2GemmaConfig",
"SiglipVisionConfig"
"Qwen2_5_VLConfig", "Qwen3ArchConfig", "Qwen3Config", "Reason1ArchConfig",
"Reason1Config", "LTX2GemmaConfig", "SiglipVisionConfig"
]
99 changes: 99 additions & 0 deletions fastvideo/configs/models/encoders/qwen3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass, field

from fastvideo.configs.models.encoders.base import (TextEncoderArchConfig,
TextEncoderConfig)


def _is_transformer_layer(n: str, m) -> bool:
return "layers" in n and str.isdigit(n.split(".")[-1])


def _is_embeddings(n: str, m) -> bool:
return n.endswith("embed_tokens")


def _is_final_norm(n: str, m) -> bool:
return n.endswith("norm")


@dataclass
class Qwen3ArchConfig(TextEncoderArchConfig):
    """Architecture config for Qwen3 text encoder (used in Ovis-Image).

    Field defaults are taken from the Ovis2.5-2B (Qwen3-2B) checkpoint's
    HuggingFace config.
    """

    # Model architecture - defaults from Ovis2.5-2B (Qwen3-2B)
    vocab_size: int = 151936
    hidden_size: int = 2048
    intermediate_size: int = 6144  # Actual value from Ovis2.5-2B
    num_hidden_layers: int = 28  # Actual value from Ovis2.5-2B
    num_attention_heads: int = 16
    num_key_value_heads: int = 8  # Actual value from Ovis2.5-2B (GQA)
    hidden_act: str = "silu"
    max_position_embeddings: int = 40960  # Actual value from Ovis2.5-2B
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-06
    use_cache: bool = True
    tie_word_embeddings: bool = True
    rope_theta: float = 1000000.0
    rope_scaling: dict | None = None
    use_sliding_window: bool = False
    sliding_window: int | None = None  # Can be None
    max_window_layers: int = 28  # Actual value from Ovis2.5-2B
    attention_dropout: float = 0.0
    attention_bias: bool = False
    head_dim: int = 128

    # HuggingFace transformers fields
    bos_token_id: int = 151643
    eos_token_id: int = 151645
    dtype: str = "float32"
    _attn_implementation_autoset: bool = True
    layer_types: list[str] = field(
        default_factory=lambda: ["full_attention"] * 28)

    # FastVideo-specific settings
    hidden_state_skip_layer: int = 0
    text_len: int = 256  # Max user-prompt token length

    # Ovis-Image uses system prompt tokens (28 tokens) prepended to user tokens
    user_prompt_begin_id: int = 28

    # Qwen3-specific stacked params: shard ids are "q"/"k"/"v" for the fused
    # QKV projection and 0/1 for the fused gate/up projection.
    stacked_params_mapping: list[tuple[str, str, str | int]] = field(
        default_factory=lambda: [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ])

    _fsdp_shard_conditions: list = field(
        default_factory=lambda:
        [_is_transformer_layer, _is_embeddings, _is_final_norm])

    def __post_init__(self):
        super().__post_init__()
        # Override tokenizer_kwargs for apply_chat_template.
        # Ovis-Image uses a chat template with a system prompt (28 tokens
        # prepended), so total max_length = text_len + user_prompt_begin_id.
        self.tokenizer_kwargs = {
            "add_generation_prompt": True,
            "tokenize": True,
            "return_dict": True,
            "padding": "max_length",
            "max_length": self.text_len + self.user_prompt_begin_id,
            "truncation": True,
            "return_tensors": "pt",
            "enable_thinking": False,
        }


@dataclass
class Qwen3Config(TextEncoderConfig):
    """Configuration for the Qwen3 text encoder.

    ``is_chat_model=True`` routes prompts through ``apply_chat_template``
    (see ``Qwen3ArchConfig.tokenizer_kwargs``).
    """

    arch_config: TextEncoderArchConfig = field(default_factory=Qwen3ArchConfig)
    prefix: str = "qwen3"
    is_chat_model: bool = True
20 changes: 20 additions & 0 deletions fastvideo/configs/models/vaes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,26 @@ class VAEArchConfig(ArchConfig):
temporal_compression_ratio: int = 4
spatial_compression_ratio: int = 8

# Additional fields from diffusers AutoencoderKL
act_fn: str = "silu"
block_out_channels: list[int] = field(
default_factory=lambda: [128, 256, 512, 512])
down_block_types: list[str] = field(default_factory=list)
up_block_types: list[str] = field(default_factory=list)
force_upcast: bool = False
in_channels: int = 3
latent_channels: int = 16
latents_mean: list[float] | None = None
latents_std: list[float] | None = None
layers_per_block: int = 2
mid_block_add_attention: bool = True
norm_num_groups: int = 32
out_channels: int = 3
sample_size: int = 1024
shift_factor: float | None = None
use_post_quant_conv: bool = False
use_quant_conv: bool = False


@dataclass
class VAEConfig(ModelConfig):
Expand Down
37 changes: 37 additions & 0 deletions fastvideo/configs/ovis_image_7b_t2i_pipeline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"embedded_cfg_scale": 5.0,
"flow_shift": 3.0,
"dit_cpu_offload": false,
"disable_autocast": false,
"precision": "bf16",
"vae_precision": "fp32",
"vae_tiling": true,
"vae_sp": false,
"vae_config": {
"load_encoder": false,
"load_decoder": true,
"tile_sample_min_height": 256,
"tile_sample_min_width": 256,
"tile_sample_stride_height": 192,
"tile_sample_stride_width": 192,
"use_tiling": true,
"use_temporal_tiling": false,
"use_parallel_tiling": false,
"use_feature_cache": true
},
"dit_config": {
"prefix": "OvisImage",
"quant_config": null
},
"text_encoder_precisions": [
"bf16"
],
"text_encoder_configs": [
{
"prefix": "qwen3",
"quant_config": null,
"lora_config": null
}
],
"enable_torch_compile": false
}
Loading