Add ComfyUI-style auto_target_shape option for animate model inference

helloyongyang · helloyongyang · commit 9861558bf0f9 · 2026-05-25T06:36:09.000Z
diff --git a/configs/platforms/mlu/wan_animate.json b/configs/platforms/mlu/wan_animate.json
@@ -1,9 +1,10 @@
 {
     "infer_steps": 4,
-    "target_video_length": 77,
+    "target_video_length": 81,
     "text_len": 512,
-    "target_height": 720,
-    "target_width": 1280,
+    "auto_target_shape": false,
+    "target_height": 1280,
+    "target_width": 720,
     "self_attn_1_type": "mlu_flash_attn",
     "cross_attn_1_type": "mlu_flash_attn",
     "cross_attn_2_type": "mlu_flash_attn",
@@ -18,9 +19,7 @@
     "rms_norm_type": "mlu_rms_norm",
     "refert_num": 1,
     "replace_flag": false,
-    "fps": 30,
-    "denoising_step_list": [1000, 750, 500, 250],
-    "scheduler_type": "WanStepDistillScheduler",
+    "fps": 24,
     "lora_configs": [
         {
         "path": "lightx2v/Wan2.1-Distill-Loras/wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors",
diff --git a/configs/platforms/mlu/wan_animate_dist.json b/configs/platforms/mlu/wan_animate_dist.json
@@ -1,9 +1,10 @@
 {
     "infer_steps": 4,
-    "target_video_length": 77,
+    "target_video_length": 81,
     "text_len": 512,
-    "target_height": 720,
-    "target_width": 1280,
+    "auto_target_shape": false,
+    "target_height": 1280,
+    "target_width": 720,
     "self_attn_1_type": "mlu_flash_attn",
     "cross_attn_1_type": "mlu_flash_attn",
     "cross_attn_2_type": "mlu_flash_attn",
@@ -18,17 +19,15 @@
     "rms_norm_type": "mlu_rms_norm",
     "refert_num": 1,
     "replace_flag": false,
-    "fps": 30,
-    "denoising_step_list": [1000, 750, 500, 250],
-    "scheduler_type": "WanStepDistillScheduler",
+    "fps": 24,
     "lora_configs": [
         {
         "path": "lightx2v/Wan2.1-Distill-Loras/wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors",
         "strength": 1.0
         }
     ],
     "parallel": {
-        "seq_p_size": 8,
+        "seq_p_size": 2,
         "seq_p_attn_type": "ulysses"
     }
 }
diff --git a/configs/wan22/wan_animate_lora.json b/configs/wan22/wan_animate_lora.json
@@ -15,8 +15,6 @@
     "refert_num": 1,
     "replace_flag": false,
     "fps": 30,
-    "denoising_step_list": [1000, 750, 500, 250],
-    "scheduler_type": "WanStepDistillScheduler",
     "lora_configs": [
         {
         "path": "lightx2v/Wan2.1-Distill-Loras/wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors",
diff --git a/configs/wan22/wan_animate_lora_dist.json b/configs/wan22/wan_animate_lora_dist.json
@@ -15,8 +15,6 @@
     "refert_num": 1,
     "replace_flag": false,
     "fps": 30,
-    "denoising_step_list": [1000, 750, 500, 250],
-    "scheduler_type": "WanStepDistillScheduler",
     "lora_configs": [
         {
         "path": "lightx2v/Wan2.1-Distill-Loras/wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors",
diff --git a/lightx2v/models/runners/wan/wan_animate_runner.py b/lightx2v/models/runners/wan/wan_animate_runner.py
@@ -106,6 +106,40 @@ def padding_resize(
 
         return img_pad
 
+    def use_auto_target_shape(self):
+        return self.config.get("auto_target_shape", True)
+
+    def get_comfy_target_shape(self):
+        height = (int(self.config["target_height"]) // 16) * 16
+        width = (int(self.config["target_width"]) // 16) * 16
+        if height <= 0 or width <= 0:
+            raise ValueError(f"Invalid WanAnimate target shape: height={height}, width={width}")
+        return height, width
+
+    def center_crop_to_aspect(self, img, height, width):
+        ori_height, ori_width = img.shape[:2]
+        target_aspect = width / height
+        ori_aspect = ori_width / ori_height
+        if ori_aspect > target_aspect:
+            crop_width = max(1, round(ori_height * target_aspect))
+            x0 = max(0, (ori_width - crop_width) // 2)
+            img = img[:, x0 : x0 + crop_width]
+        elif ori_aspect < target_aspect:
+            crop_height = max(1, round(ori_width / target_aspect))
+            y0 = max(0, (ori_height - crop_height) // 2)
+            img = img[y0 : y0 + crop_height]
+        return img
+
+    def comfy_resize(self, img, height, width, interpolation=cv2.INTER_LANCZOS4, crop=None):
+        if crop == "center":
+            img = self.center_crop_to_aspect(img, height=height, width=width)
+        if img.shape[0] == height and img.shape[1] == width:
+            return img
+        return cv2.resize(img, (width, height), interpolation=interpolation)
+
+    def comfy_resize_frames(self, frames, height, width, interpolation=cv2.INTER_LANCZOS4, crop=None):
+        return np.stack([self.comfy_resize(frame, height, width, interpolation=interpolation, crop=crop) for frame in frames])
+
     def prepare_source(self, src_pose_path, src_face_path, src_ref_path):
         pose_video_reader = VideoReader(src_pose_path)
         pose_len = len(pose_video_reader)
@@ -118,7 +152,14 @@ def prepare_source(self, src_pose_path, src_face_path, src_ref_path):
         face_images = face_video_reader.get_batch(face_idxs).asnumpy()
         height, width = cond_images[0].shape[:2]
         refer_images = cv2.imread(src_ref_path)[..., ::-1]
-        refer_images = self.padding_resize(refer_images, height=height, width=width)
+        if self.use_auto_target_shape():
+            refer_images = self.padding_resize(refer_images, height=height, width=width)
+        else:
+            target_height, target_width = self.get_comfy_target_shape()
+            logger.info(f"WanAnimate uses config target shape: height={target_height}, width={target_width}")
+            cond_images = self.comfy_resize_frames(cond_images, target_height, target_width)
+            refer_images = self.comfy_resize(refer_images, target_height, target_width)
+            face_images = self.comfy_resize_frames(face_images, 512, 512, crop="center")
         return cond_images, face_images, refer_images
 
     def prepare_source_for_replace(self, src_bg_path, src_mask_path):
@@ -132,6 +173,10 @@ def prepare_source_for_replace(self, src_bg_path, src_mask_path):
         mask_idxs = list(range(mask_len))
         mask_images = mask_video_reader.get_batch(mask_idxs).asnumpy()
         mask_images = mask_images[:, :, :, 0] / 255
+        if not self.use_auto_target_shape():
+            target_height, target_width = self.get_comfy_target_shape()
+            bg_images = self.comfy_resize_frames(bg_images, target_height, target_width)
+            mask_images = self.comfy_resize_frames(mask_images, target_height, target_width, interpolation=cv2.INTER_NEAREST)
         return bg_images, mask_images
 
     @ProfilingContext4DebugL2("Run Image Encoders")
diff --git a/scripts/platforms/mlu/run_wan22_animate_dist.sh b/scripts/platforms/mlu/run_wan22_animate_dist.sh
@@ -25,7 +25,7 @@ python ${lightx2v_path}/tools/preprocess/preprocess_data.py \
     --resolution_area 1280 720 \
     --retarget_flag \
 
-torchrun --nproc_per_node=8 -m lightx2v.infer \
+torchrun --nproc_per_node=2 -m lightx2v.infer \
 --model_cls wan2.2_animate \
 --task animate \
 --model_path $model_path \

Original file line number	Diff line number	Diff line change
`@@ -15,8 +15,6 @@`
`15`	`15`	`"refert_num": 1,`
`16`	`16`	`"replace_flag": false,`
`17`	`17`	`"fps": 30,`
`18`		`- "denoising_step_list": [1000, 750, 500, 250],`
`19`		`- "scheduler_type": "WanStepDistillScheduler",`
`20`	`18`	`"lora_configs": [`
`21`	`19`	`{`
`22`	`20`	`"path": "lightx2v/Wan2.1-Distill-Loras/wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors",`