diff --git a/comfy/samplers.py b/comfy/samplers.py index 0a4d062db042..755fb0a715d2 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -1006,8 +1006,19 @@ def sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callba return latent_image if latent_image.is_nested: - latent_image, latent_shapes = comfy.utils.pack_latents(latent_image.unbind()) - noise, _ = comfy.utils.pack_latents(noise.unbind()) + li_tensors = latent_image.unbind() + if noise.is_nested: + # Truncate extra noise components, pad missing ones with zeros + n_tensors = list(noise.unbind()[:len(li_tensors)]) + for i in range(len(n_tensors), len(li_tensors)): + n_tensors.append(torch.zeros_like(li_tensors[i])) + else: + # Noise only covers video -- pad remaining components (audio) with zeros + n_tensors = [noise] + for i in range(1, len(li_tensors)): + n_tensors.append(torch.zeros_like(li_tensors[i])) + latent_image, latent_shapes = comfy.utils.pack_latents(li_tensors) + noise, _ = comfy.utils.pack_latents(n_tensors) else: latent_shapes = [latent_image.shape]