Skip to content

Commit efee285

Browse files
author
Talmaj Marinc
committed
Add I2V for causal forcing model.
1 parent fc303cb commit efee285

2 files changed

Lines changed: 69 additions & 0 deletions

File tree

comfy/k_diffusion/sampling.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1859,6 +1859,23 @@ def sample_ar_video(model, x, sigmas, extra_args=None, callback=None, disable=No
18591859
output = torch.zeros_like(x)
18601860
s_in = x.new_ones([x.shape[0]])
18611861
current_start_frame = 0
1862+
1863+
# I2V: seed KV cache with the initial image latent before the denoising loop
1864+
initial_latent = ar_config.get("initial_latent", None)
1865+
if initial_latent is not None:
1866+
initial_latent = inner_model.process_latent_in(initial_latent).to(device=device, dtype=model_dtype)
1867+
n_init = initial_latent.shape[2]
1868+
output[:, :, :n_init] = initial_latent
1869+
1870+
ar_state = {"start_frame": 0, "kv_caches": kv_caches, "crossattn_caches": crossattn_caches}
1871+
transformer_options["ar_state"] = ar_state
1872+
zero_sigma = sigmas.new_zeros([1])
1873+
_ = model(initial_latent, zero_sigma * s_in, **extra_args)
1874+
1875+
current_start_frame = n_init
1876+
remaining = lat_t - n_init
1877+
num_blocks = -(-remaining // num_frame_per_block)
1878+
18621879
num_sigma_steps = len(sigmas) - 1
18631880
total_real_steps = num_blocks * num_sigma_steps
18641881
step_count = 0

comfy_extras/nodes_ar_video.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
ComfyUI nodes for autoregressive video generation (Causal Forcing, Self-Forcing, etc.).
33
- EmptyARVideoLatent: create 5D [B, C, T, H, W] video latent tensors
44
- SamplerARVideo: SAMPLER for the block-by-block autoregressive denoising loop
5+
- ARVideoI2V: image-to-video conditioning for AR models (seeds KV cache with start image)
56
"""
67

78
import torch
89
from typing_extensions import override
910

1011
import comfy.model_management
1112
import comfy.samplers
13+
import comfy.utils
1214
from comfy_api.latest import ComfyExtension, io
1315

1416

@@ -71,12 +73,62 @@ def execute(cls, num_frame_per_block) -> io.NodeOutput:
7173
return io.NodeOutput(comfy.samplers.ksampler("ar_video", extra_options))
7274

7375

76+
class ARVideoI2V(io.ComfyNode):
    """Image-to-video conditioning node for autoregressive video models
    (Causal Forcing, Self-Forcing).

    Encodes the start image through the VAE and stashes the resulting latent
    in the model's transformer_options (under "ar_config") so the AR sampler
    can seed its KV cache before the denoising loop begins. The same T2V
    checkpoint is reused; no dedicated I2V architecture is required.
    """

    @classmethod
    def define_schema(cls):
        # Inputs mirror the usual empty-latent nodes plus the model/VAE/image
        # needed to produce the seed latent.
        schema_inputs = [
            io.Model.Input("model"),
            io.Vae.Input("vae"),
            io.Image.Input("start_image"),
            io.Int.Input("width", default=832, min=16, max=8192, step=16),
            io.Int.Input("height", default=480, min=16, max=8192, step=16),
            io.Int.Input("length", default=81, min=1, max=1024, step=4),
            io.Int.Input("batch_size", default=1, min=1, max=64),
        ]
        schema_outputs = [
            io.Model.Output(display_name="MODEL"),
            io.Latent.Output(display_name="LATENT"),
        ]
        return io.Schema(
            node_id="ARVideoI2V",
            category="conditioning/video_models",
            inputs=schema_inputs,
            outputs=schema_outputs,
        )

    @classmethod
    def execute(cls, model, vae, start_image, width, height, length, batch_size) -> io.NodeOutput:
        # Resize the (first) start image to the target resolution; channels
        # move to dim 1 for the upscale helper and back afterwards.
        resized = comfy.utils.common_upscale(
            start_image[:1].movedim(-1, 1), width, height, "bilinear", "center"
        ).movedim(1, -1)

        # Drop any alpha channel before VAE encoding.
        seed_latent = vae.encode(resized[:, :, :, :3])

        # Clone so the patch does not leak into other users of this model,
        # then record the seed latent where the AR sampler will look for it.
        patched = model.clone()
        trans_opts = patched.model_options.setdefault("transformer_options", {})
        trans_opts.setdefault("ar_config", {})["initial_latent"] = seed_latent

        # Temporal compression is 4x (+1 for the first frame); spatial is 8x
        # with 16 latent channels — assumes a Wan-style VAE, matching the
        # sampler this node feeds. TODO(review): confirm against the VAE used.
        temporal_len = ((length - 1) // 4) + 1
        empty_latent = torch.zeros(
            [batch_size, 16, temporal_len, height // 8, width // 8],
            device=comfy.model_management.intermediate_device(),
        )
        return io.NodeOutput(patched, {"samples": empty_latent})
74125
class ARVideoExtension(ComfyExtension):
    """Extension entry point registering the AR video nodes."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        # Order is cosmetic; the UI lists nodes by category/name.
        nodes: list[type[io.ComfyNode]] = [
            EmptyARVideoLatent,
            SamplerARVideo,
            ARVideoI2V,
        ]
        return nodes
81133

82134

0 commit comments

Comments
 (0)