|
| 1 | +import torch |
| 2 | + |
| 3 | +from lightx2v.models.networks.wan.infer.module_io import GridOutput |
| 4 | +from lightx2v.models.networks.wan.infer.pre_infer import WanPreInfer |
| 5 | + |
| 6 | +from .module_io import MotusPreInferModuleOutput |
| 7 | + |
| 8 | + |
class MotusPreInfer(WanPreInfer):
    """Pre-inference stage for the Motus model.

    Builds on ``WanPreInfer`` by packaging the scheduler's video latents and
    the Motus-specific conditioning tensors (first frame, state, VLM inputs,
    T5 embeddings, ...) into a ``MotusPreInferModuleOutput`` for the
    transformer stage. A scheduler must be attached via ``set_scheduler``
    before ``infer`` is called.
    """

    def __init__(self, model, config):
        super().__init__(config)
        self.model = model
        # Injected later through set_scheduler(); infer() refuses to run
        # without it.
        self.scheduler = None

    def set_scheduler(self, scheduler):
        """Attach the scheduler whose latents drive infer()."""
        self.scheduler = scheduler

    @torch.no_grad()
    def infer(self, weights, inputs, kv_start=0, kv_end=0):
        """Assemble the pre-infer output for the Motus transformer.

        ``weights``, ``kv_start`` and ``kv_end`` exist only for signature
        compatibility with the base class and are discarded immediately.
        Raises ``RuntimeError`` when no scheduler is set or when the
        scheduler's video latents are not 5-dimensional.
        """
        del weights, kv_start, kv_end
        if self.scheduler is None:
            raise RuntimeError("MotusPreInfer requires a scheduler before infer().")

        # Pull the Motus conditioning tensors out of the shared inputs dict.
        first_frame = inputs["motus_first_frame"]
        state = inputs["motus_state"]
        instruction = inputs["motus_instruction"]
        t5_context = inputs["motus_t5_embeddings"]
        processed_t5_context = inputs["motus_processed_t5_context"]
        vlm_inputs = inputs["motus_vlm_inputs"]
        image_context = inputs["motus_image_context"]
        und_tokens = inputs["motus_und_tokens"]

        latents = self.scheduler.video_latents
        if latents.dim() != 5:
            shape_seen = tuple(latents.shape)
            raise RuntimeError(f"Expected video latents with shape [B, C, T, H, W], got {shape_seen}")

        # One grid row [T, H/patch_h, W/patch_w], expanded (view, no copy)
        # across the batch dimension.
        batch = state.shape[0]
        _, _, lat_t, lat_h, lat_w = latents.shape
        patch = self.model.video_backbone.patch_size
        grid_row = [lat_t, lat_h // patch[1], lat_w // patch[2]]
        grid_tensor = torch.tensor([grid_row], dtype=torch.long, device=state.device).expand(batch, -1)
        grid_output = GridOutput(
            tensor=grid_tensor,
            tuple=tuple(int(v) for v in grid_tensor[0].tolist()),
        )

        # Rebuild the cached rotary cos/sin table only when the grid changed
        # (or on first use).
        stale = self.cos_sin is None or self.grid_sizes != grid_output.tuple
        if stale:
            self.grid_sizes = grid_output.tuple
            self.cos_sin = self.prepare_cos_sin(grid_output.tuple, self.freqs.clone())

        # Motus does not use the Wan timestep/text embeddings here; fill the
        # required fields with an empty placeholder of matching dtype.
        dummy_embed = torch.empty(0, device=state.device, dtype=processed_t5_context.dtype)

        return MotusPreInferModuleOutput(
            embed=dummy_embed,
            grid_sizes=grid_output,
            x=latents,
            embed0=dummy_embed,
            context=processed_t5_context,
            cos_sin=self.cos_sin,
            first_frame=first_frame,
            state=state,
            instruction=instruction,
            t5_embeddings=t5_context,
            vlm_inputs=vlm_inputs,
            image_context=image_context,
            und_tokens=und_tokens,
            condition_frame_latent=self.scheduler.condition_frame_latent,
            adapter_args={"instruction": instruction},
        )
0 commit comments