Skip to content

Commit 534e2e4

Browse files
authored
fix bug for float duration src_audio (#1421)
1 parent 84a75de commit 534e2e4

3 files changed

Lines changed: 14 additions & 13 deletions

File tree

diffsynth/pipelines/ace_step.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -448,28 +448,27 @@ def pad_src_audio(self, pipe, src_audio, task_type, repainting_ranges):
448448
return src_audio, repainting_ranges, None, None
449449
min_left = min([start for start, end in repainting_ranges])
450450
max_right = max([end for start, end in repainting_ranges])
451-
total_length = src_audio.shape[-1] // pipe.vae.sampling_rate
452451
pad_left = max(0, -min_left)
453-
pad_right = max(0, max_right - total_length)
454-
if pad_left > 0 or pad_right > 0:
455-
padding_frames_left, padding_frames_right = pad_left * pipe.vae.sampling_rate, pad_right * pipe.vae.sampling_rate
452+
padding_frames_left = int(pad_left * pipe.vae.sampling_rate)
453+
padding_frames_right = max(int(max_right * pipe.vae.sampling_rate) - src_audio.shape[-1], 0)
454+
if padding_frames_left > 0 or padding_frames_right > 0:
456455
src_audio = F.pad(src_audio, (padding_frames_left, padding_frames_right), value=0.0)
457456
repainting_ranges = [(start + pad_left, end + pad_left) for start, end in repainting_ranges]
458-
return src_audio, repainting_ranges, pad_left, pad_right
457+
return src_audio, repainting_ranges, padding_frames_left, padding_frames_right
459458

460-
def parse_repaint_masks(self, pipe, src_latents, task_type, repainting_ranges, repainting_strength, pad_left, pad_right):
459+
def parse_repaint_masks(self, pipe, src_latents, task_type, repainting_ranges, repainting_strength, padding_frames_left, padding_frames_right):
461460
if task_type != "repaint" or repainting_ranges is None:
462461
return None, src_latents
463462
# let repainting area be repainting_strength, non-repainting area be 0.0, and blend at the boundary with cf_frames.
464463
max_latent_length = src_latents.shape[1]
465464
denoise_mask = torch.zeros((1, max_latent_length, 1), dtype=pipe.torch_dtype, device=pipe.device)
466465
for start, end in repainting_ranges:
467-
start_frame = start * pipe.vae.sampling_rate // 1920
468-
end_frame = end * pipe.vae.sampling_rate // 1920
466+
start_frame = int(start * pipe.vae.sampling_rate / 1920)
467+
end_frame = int(end * pipe.vae.sampling_rate / 1920)
469468
denoise_mask[:, start_frame:end_frame, :] = repainting_strength
470469
# set padding areas to 1.0 (full repaint) to avoid artifacts at the boundaries caused by padding
471-
pad_left_frames = pad_left * pipe.vae.sampling_rate // 1920
472-
pad_right_frames = pad_right * pipe.vae.sampling_rate // 1920
470+
pad_left_frames = int(padding_frames_left / 1920)
471+
pad_right_frames = int(padding_frames_right / 1920)
473472
denoise_mask[:, :pad_left_frames, :] = 1
474473
denoise_mask[:, max_latent_length - pad_right_frames:, :] = 1
475474

@@ -506,10 +505,12 @@ def process(self, pipe, duration, src_audio, audio_code_string, task_type=None,
506505
if task_type == "cover":
507506
lm_hints_5Hz = self.tokenize(pipe.tokenizer_model.tokenizer, src_latents, pipe.silence_latent, pipe.tokenizer_model.tokenizer.pool_window_size)
508507
src_latents = pipe.tokenizer_model.detokenizer(lm_hints_5Hz)
508+
if src_latents.shape[1] > source_latents.shape[1]:
509+
source_latents = torch.cat([source_latents, src_latents[:, source_latents.shape[1]:]], dim=1)
509510
max_latent_length = src_latents.shape[1]
510511
else:
511512
# use silence latents.
512-
max_latent_length = int(duration * pipe.sample_rate // 1920)
513+
max_latent_length = round(duration * pipe.sample_rate / 1920)
513514
src_latents = self._get_silence_latent_slice(pipe, max_latent_length).unsqueeze(0)
514515
chunk_masks = torch.ones((1, max_latent_length, src_latents.shape[-1]), dtype=torch.bool, device=pipe.device)
515516
attention_mask = torch.ones((1, max_latent_length), device=src_latents.device, dtype=pipe.torch_dtype)

docs/en/Model_Details/ACE-Step.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ The input parameters for `AceStepPipeline` inference include:
105105
* `seed`: Random seed.
106106
* `rand_device`: Device for noise generation, defaults to "cpu".
107107
* `num_inference_steps`: Number of inference steps, defaults to 8.
108-
* `shift`: Timestep shift parameter for the scheduler, defaults to 1.0.
108+
* `shift`: Timestep shift parameter for the scheduler, defaults to 3.0.
109109

110110
## Model Training
111111

docs/zh/Model_Details/ACE-Step.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ save_audio(audio, pipe.vae.sampling_rate, "acestep-v15-turbo.wav")
105105
* `seed`: 随机种子。
106106
* `rand_device`: 噪声生成设备,默认为 "cpu"。
107107
* `num_inference_steps`: 推理步数,默认为 8。
108-
* `shift`: 调度器时间偏移参数,默认为 1.0。
108+
* `shift`: 调度器时间偏移参数,默认为 3.0。
109109

110110
## 模型训练
111111

0 commit comments

Comments
 (0)