update z-image: add turbo i2i config and script, i2i denoise strength, and fix width/height order

helloyongyang · helloyongyang · commit ea4def4498c0 · 2026-06-26T11:19:39.000Z
diff --git a/configs/z_image/z_image_turbo_i2i.json b/configs/z_image/z_image_turbo_i2i.json
@@ -0,0 +1,9 @@
+{
+    "num_channels_latents": 16,
+    "infer_steps": 9,
+    "attn_type": "flash_attn3",
+    "enable_cfg": false,
+    "sample_guide_scale": 0.0,
+    "patch_size": 2,
+    "i2i_denoise_strength": 1.0
+}
diff --git a/lightx2v/models/runners/z_image/z_image_runner.py b/lightx2v/models/runners/z_image/z_image_runner.py
@@ -159,7 +159,10 @@ def read_image_input(self, img_path):
 
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_i2i(self):
-        image_paths_list = self.input_info.image_path.split(",")
+        image_paths_list = [image_path.strip() for image_path in self.input_info.image_path.split(",") if image_path.strip()]
+        if len(image_paths_list) != 1:
+            raise ValueError(f"z-image i2i currently supports exactly one input image, got {len(image_paths_list)}.")
+
         images_list = []
         for image_path in image_paths_list:
             _, image = self.read_image_input(image_path)
@@ -299,7 +302,18 @@ def get_input_target_shape(self):
             logger.info(f"Z Image Runner got custom shape: {width}x{height}")
             return (width, height)
 
-        aspect_ratio = self.input_info.aspect_ratio if self.input_info.aspect_ratio else self.config.get("aspect_ratio", None)
+        aspect_ratio = self.input_info.aspect_ratio
+        if aspect_ratio in as_maps:
+            logger.info(f"Z Image Runner got aspect ratio: {aspect_ratio}")
+            width, height = as_maps[aspect_ratio]
+            return (width, height)
+
+        if self.config["task"] == "i2i" and self.input_info.original_size:
+            width, height = self.input_info.original_size[-1]
+            logger.info(f"Z Image Runner got i2i source image shape: {width}x{height}")
+            return (width, height)
+
+        aspect_ratio = self.config.get("aspect_ratio", None)
         if aspect_ratio in as_maps:
             logger.info(f"Z Image Runner got aspect ratio: {aspect_ratio}")
             width, height = as_maps[aspect_ratio]
@@ -309,7 +323,7 @@ def get_input_target_shape(self):
         raise NotImplementedError
 
     def set_target_shape(self):
-        height, width = self.get_input_target_shape()
+        width, height = self.get_input_target_shape()
 
         # VAE applies 8x compression on images but we must also account for packing which requires
         # latent height and width to be divisible by 2.
@@ -326,7 +340,7 @@ def set_img_shapes(self):
                 raise ValueError(f"target_shape must be 4D [B, C, H, W], got {len(self.input_info.target_shape)}D: {self.input_info.target_shape}")
             _, _, latent_height, latent_width = self.input_info.target_shape
         else:
-            height, width = self.get_input_target_shape()
+            width, height = self.get_input_target_shape()
 
             vae_scale_factor = self.config["vae_scale_factor"]
             latent_height = 2 * (int(height) // (vae_scale_factor * 2))
diff --git a/lightx2v/models/schedulers/z_image/scheduler.py b/lightx2v/models/schedulers/z_image/scheduler.py
@@ -431,6 +431,112 @@ def create_coordinate_grid(size, start=None, device=None):
         grids = torch.meshgrid(axes, indexing="ij")
         return torch.stack(grids, dim=-1)
 
+    def _get_i2i_denoise_strength(self, input_info):
+        """Resolve and validate the i2i denoise strength.
+
+        The value is read from ``input_info.i2i_denoise_strength`` first and
+        falls back to ``self.config.get("i2i_denoise_strength")`` when unset.
+
+        Returns:
+            The strength as a float in [0.0, 1.0], or None when neither source
+            provides a value.
+
+        Raises:
+            ValueError: If the resolved value lies outside [0.0, 1.0] or is NaN.
+        """
+        strength = getattr(input_info, "i2i_denoise_strength", None)
+        if strength is None:
+            strength = self.config.get("i2i_denoise_strength")
+        if strength is None:
+            return None
+        strength = float(strength)
+        # Written as a positive range test so that NaN, which compares False
+        # against every bound, is rejected here instead of failing later in round().
+        if not 0.0 <= strength <= 1.0:
+            raise ValueError(f"The value of i2i_denoise_strength should be in [0.0, 1.0] but is {strength}")
+        return strength
+
+    def _get_single_i2i_image_latents(self, input_info):
+        """Return the ``image_latents`` entry of the single encoded input image.
+
+        Raises:
+            ValueError: If ``input_info.image_encoder_output`` is missing or
+                empty, or does not hold exactly one entry.
+        """
+        image_encoder_output = getattr(input_info, "image_encoder_output", None)
+        if not image_encoder_output:
+            raise ValueError("z-image i2i requires exactly one input image with VAE image latents.")
+        if len(image_encoder_output) != 1:
+            raise ValueError(f"z-image i2i currently supports single-image editing only, got {len(image_encoder_output)} images.")
+        return image_encoder_output[0]["image_latents"]
+
+    def get_timesteps(self, num_inference_steps, strength):
+        """Select the tail of ``self.timesteps`` to run for a given strength.
+
+        Args:
+            num_inference_steps: Number of steps in the unshortened schedule.
+            strength: Fraction of the schedule to run.
+
+        Returns:
+            A ``(timesteps, target_steps)`` tuple: the sliced timesteps and the
+            number of denoising steps they correspond to.
+
+        Raises:
+            ValueError: If the rounded number of denoising steps is less than 1.
+        """
+        # Clamp so that a direct call with strength > 1.0 cannot produce a
+        # negative start index, which would slice from the end of the schedule.
+        target_steps = min(round(num_inference_steps * strength), num_inference_steps)
+        if target_steps < 1:
+            raise ValueError(
+                "i2i_denoise_strength results in 0 denoising steps: "
+                f"round(infer_steps * i2i_denoise_strength)=round({num_inference_steps} * {strength})={target_steps}; "
+                "please increase it to run at least 1 step."
+            )
+        t_start = num_inference_steps - target_steps
+        timesteps = self.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+        return timesteps, target_steps
+
+    def _resize_i2i_image_latents(self, image_latents, target_height, target_width, target_channels):
+        """Validate image latents and resize them to the target spatial size.
+
+        Raises:
+            ValueError: If ``image_latents`` is not 4D or its channel dimension
+                differs from ``target_channels``.
+        """
+        if image_latents.ndim != 4:
+            raise ValueError(f"Expected z-image i2i image latents with shape [B, C, H, W], got {tuple(image_latents.shape)}")
+        if image_latents.shape[1] != target_channels:
+            raise ValueError(f"z-image i2i image latent channels {image_latents.shape[1]} do not match target channels {target_channels}.")
+        # Bilinear interpolation is applied only when the spatial size differs.
+        if image_latents.shape[-2:] != (target_height, target_width):
+            image_latents = F.interpolate(image_latents, size=(target_height, target_width), mode="bilinear", align_corners=False)
+        return image_latents
+
+    def prepare_i2i_denoise_strength_latents(self, input_info):
+        """Replace ``self.latents`` with the scheduler-noised input image latents.
+
+        The current ``self.latents`` is passed as the noise argument to
+        ``self.scheduler.scale_noise`` together with the image latents and the
+        first entry of ``self.timesteps``.
+
+        Raises:
+            ValueError: If the output latent batch size is not 1, or the image
+                latents fail the single-image, rank or channel checks.
+        """
+        image_latents = self._get_single_i2i_image_latents(input_info).to(device=AI_DEVICE, dtype=self.dtype)
+        if self.latents.shape[0] != 1:
+            raise ValueError(f"z-image i2i currently supports single-image single-output editing only, got output latent batch {self.latents.shape[0]}.")
+
+        _, target_channels, target_height, target_width = self.latents.shape
+        image_latents = self._resize_i2i_image_latents(image_latents, target_height, target_width, target_channels)
+
+        latent_timestep = self.timesteps[:1]
+        noise = self.latents
+        self.latents = self.scheduler.scale_noise(image_latents, latent_timestep, noise)
+
     def prepare_latents(self, input_info):
         self.input_info = input_info
         shape = input_info.target_shape
@@ -477,7 +531,8 @@ def generate_freqs_cis_from_position_ids(self, position_ids: torch.Tensor, devic
 
     def set_timesteps(self):
         sigmas = np.linspace(1.0, 1 / self.config["infer_steps"], self.config["infer_steps"])
-        image_seq_len = self.latents.shape[1]
+        _, _, latent_height, latent_width = self.latents.shape
+        image_seq_len = (latent_height // 2) * (latent_width // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler_config.get("base_image_seq_len", 256),
@@ -497,6 +552,13 @@ def set_timesteps(self):
         self.timesteps = timesteps
         self.infer_steps = num_inference_steps
 
+        if self.config["task"] == "i2i":
+            strength = self._get_i2i_denoise_strength(self.input_info)
+            if strength is not None:
+                timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+                self.timesteps = timesteps
+                self.infer_steps = num_inference_steps
+
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
         self.num_warmup_steps = num_warmup_steps
@@ -509,6 +571,9 @@ def prepare(self, input_info):
             logger.info(f"Generator is not None, using existing generator for latents")
         self.prepare_latents(input_info)
         self.set_timesteps()
+        strength = self._get_i2i_denoise_strength(input_info)
+        if self.config["task"] == "i2i" and strength is not None:
+            self.prepare_i2i_denoise_strength_latents(input_info)
 
         self.image_rotary_emb = self.pos_embed(self.input_info.image_shapes, input_info.txt_seq_lens[0], device=AI_DEVICE)
 
diff --git a/scripts/z_image/z_image_turbo_i2i.sh b/scripts/z_image/z_image_turbo_i2i.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Run z-image turbo image-to-image (i2i) inference through lightx2v.infer.
+# Edit the three paths below to match the local checkout, model and input image.
+
+# Set the paths first.
+lightx2v_path=/data/nvme1/yongyang/nb/LightX2V
+model_path=/data/nvme1/models/Tongyi-MAI/Z-Image-Turbo
+image_path="${lightx2v_path}/assets/inputs/imgs/img_0.jpg"
+
+export CUDA_VISIBLE_DEVICES=0
+
+# Set environment variables.
+source "${lightx2v_path}/scripts/base/base.sh"
+
+# Expansions are quoted so paths containing spaces are passed as single arguments.
+python -m lightx2v.infer \
+--model_cls z_image \
+--task i2i \
+--model_path "${model_path}" \
+--config_json "${lightx2v_path}/configs/z_image/z_image_turbo_i2i.json" \
+--image_path "${image_path}" \
+--prompt "Change the cat to a dog." \
+--negative_prompt " " \
+--save_result_path "${lightx2v_path}/save_results/z_image_turbo_i2i.png" \
+--seed 42 \
+--i2i_denoise_strength 1.0