Add resize/patch-size helper methods to wan_audio_runner and remove the post-run ffmpeg audio re-mux

xinyiqin · xinyiqin · commit 7f5b5db5dee6 · 2026-06-29T15:27:05.000+08:00
diff --git a/lightx2v/models/runners/wan/wan_audio_runner.py b/lightx2v/models/runners/wan/wan_audio_runner.py
@@ -2,7 +2,6 @@
 import io
 import json
 import os
-import subprocess
 import warnings
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Union
@@ -362,27 +361,35 @@ def get_audio_files_from_audio_path(self, audio_path):
 
         return audio_files, mask_files
 
+    def _get_image_resize_kwargs(self):
+        input_info = getattr(self, "input_info", None)
+        return {
+            "resize_mode": (getattr(input_info, "resize_mode", None) if input_info is not None else None) or self.config.get("resize_mode", "adaptive"),
+            "bucket_shape": self.config.get("bucket_shape", None),
+            "fixed_area": (getattr(input_info, "fixed_area", None) if input_info is not None else None) or self.config.get("fixed_area", None),
+            "fixed_shape": self.config.get("fixed_shape", None),
+        }
+
+    def _resolve_patched_spatial_size(self, h, w):
+        patched_h = h // self.config["vae_stride"][1] // self.config["patch_size"][1]
+        patched_w = w // self.config["vae_stride"][2] // self.config["patch_size"][2]
+        patched_h, patched_w = get_optimal_patched_size_with_sp(patched_h, patched_w, 1)
+        latent_h = patched_h * self.config["patch_size"][1]
+        latent_w = patched_w * self.config["patch_size"][2]
+        target_shape = [latent_h * self.config["vae_stride"][1], latent_w * self.config["vae_stride"][2]]
+        return target_shape, patched_h, patched_w
+
     def process_single_mask(self, mask_file):
         mask_img = load_image(mask_file)
         mask_img = TF.to_tensor(mask_img).sub_(0.5).div_(0.5).unsqueeze(0).to(AI_DEVICE)
 
         if mask_img.shape[1] == 3:  # If it is an RGB three-channel image
             mask_img = mask_img[:, :1]  # Only take the first channel
 
-        mask_img, h, w = resize_image(
-            mask_img,
-            resize_mode=self.config.get("resize_mode", "adaptive"),
-            bucket_shape=self.config.get("bucket_shape", None),
-            fixed_area=self.config.get("fixed_area", None),
-            fixed_shape=self.config.get("fixed_shape", None),
-        )
-
-        mask_latent = torch.nn.functional.interpolate(
-            mask_img,  # (1, 1, H, W)
-            size=(h // 16, w // 16),
-            mode="bicubic",
-        )
-
+        mask_img, h, w = resize_image(mask_img, **self._get_image_resize_kwargs())
+        target_shape, patched_h, patched_w = self._resolve_patched_spatial_size(h, w)
+        mask_img = F.interpolate(mask_img, size=(target_shape[0], target_shape[1]), mode="bicubic")
+        mask_latent = F.interpolate(mask_img, size=(patched_h, patched_w), mode="bicubic")
         mask_latent = (mask_latent > 0).to(torch.int8)
         return mask_latent
 
@@ -393,19 +400,9 @@ def read_image_input(self, img_path):
             ref_img = load_image(img_path)
         ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(0).to(AI_DEVICE)
 
-        ref_img, h, w = resize_image(
-            ref_img,
-            resize_mode=getattr(self.input_info, "resize_mode", None) or self.config.get("resize_mode", "adaptive"),
-            bucket_shape=self.config.get("bucket_shape", None),
-            fixed_area=getattr(self.input_info, "fixed_area", None) or self.config.get("fixed_area", None),
-            fixed_shape=self.config.get("fixed_shape", None),
-        )
+        ref_img, h, w = resize_image(ref_img, **self._get_image_resize_kwargs())
         logger.info(f"[wan_audio] resize_image target_h: {h}, target_w: {w}")
-        patched_h = h // self.config["vae_stride"][1] // self.config["patch_size"][1]
-        patched_w = w // self.config["vae_stride"][2] // self.config["patch_size"][2]
-
-        patched_h, patched_w = get_optimal_patched_size_with_sp(patched_h, patched_w, 1)
-
+        target_shape, patched_h, patched_w = self._resolve_patched_spatial_size(h, w)
         latent_h = patched_h * self.config["patch_size"][1]
         latent_w = patched_w * self.config["patch_size"][2]
 
@@ -415,11 +412,9 @@ def read_image_input(self, img_path):
         else:
             latent_shape = self.get_latent_shape_with_lat_hw(latent_h, latent_w)
 
-        target_shape = [latent_h * self.config["vae_stride"][1], latent_w * self.config["vae_stride"][2]]
-
         logger.info(f"[wan_audio] target_h: {target_shape[0]}, target_w: {target_shape[1]}, latent_h: {latent_h}, latent_w: {latent_w}")
 
-        ref_img = torch.nn.functional.interpolate(ref_img, size=(target_shape[0], target_shape[1]), mode="bicubic")
+        ref_img = F.interpolate(ref_img, size=(target_shape[0], target_shape[1]), mode="bicubic")
         return ref_img, latent_shape, target_shape
 
     @ProfilingContext4DebugL1(
@@ -732,26 +727,11 @@ def run_main(self):
 
             # fixed audio segments inputs
             if self.va_controller.reader is None:
-                # Save paths before super().run_main() clears input_info
-                out_path = getattr(self.input_info, "save_result_path", None)
-                orig_audio = (getattr(self.input_info, "audio_path", "") or "").split(",")[0].strip() or None
                 result = super().run_main()
                 # Stop VARecorder so ffmpeg finishes writing the file
                 if self.va_controller is not None:
                     self.va_controller.clear()
                     self.va_controller = None
-                # Re-mux with original audio to replace 16kHz audio
-                if out_path and orig_audio and os.path.isfile(out_path) and os.path.isfile(orig_audio):
-                    try:
-                        tmp = out_path + ".remux.mp4"
-                        cmd = ["ffmpeg", "-y", "-i", out_path, "-i", orig_audio,
-                               "-c:v", "copy", "-c:a", "copy",
-                               "-map", "0:v:0", "-map", "1:a:0", "-shortest", tmp]
-                        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-                        os.replace(tmp, out_path)
-                        logger.info(f"[wan_audio] Re-muxed with original audio: {orig_audio}")
-                    except Exception as exc:
-                        logger.warning(f"[wan_audio] Re-mux failed: {exc}")
                 return result
 
             self.va_controller.start()