@@ -448,28 +448,27 @@ def pad_src_audio(self, pipe, src_audio, task_type, repainting_ranges):
448448 return src_audio , repainting_ranges , None , None
449449 min_left = min ([start for start , end in repainting_ranges ])
450450 max_right = max ([end for start , end in repainting_ranges ])
451- total_length = src_audio .shape [- 1 ] // pipe .vae .sampling_rate
452451 pad_left = max (0 , - min_left )
453- pad_right = max ( 0 , max_right - total_length )
454- if pad_left > 0 or pad_right > 0 :
455- padding_frames_left , padding_frames_right = pad_left * pipe . vae . sampling_rate , pad_right * pipe . vae . sampling_rate
452+ padding_frames_left = int ( pad_left * pipe . vae . sampling_rate )
453+ padding_frames_right = max ( int ( max_right * pipe . vae . sampling_rate ) - src_audio . shape [ - 1 ], 0 )
454+ if padding_frames_left > 0 or padding_frames_right > 0 :
456455 src_audio = F .pad (src_audio , (padding_frames_left , padding_frames_right ), value = 0.0 )
457456 repainting_ranges = [(start + pad_left , end + pad_left ) for start , end in repainting_ranges ]
458- return src_audio , repainting_ranges , pad_left , pad_right
457+ return src_audio , repainting_ranges , padding_frames_left , padding_frames_right
459458
460- def parse_repaint_masks (self , pipe , src_latents , task_type , repainting_ranges , repainting_strength , pad_left , pad_right ):
459+ def parse_repaint_masks (self , pipe , src_latents , task_type , repainting_ranges , repainting_strength , padding_frames_left , padding_frames_right ):
461460 if task_type != "repaint" or repainting_ranges is None :
462461 return None , src_latents
463462 # let repainting area be repainting_strength, non-repainting area be 0.0, and blend at the boundary with cf_frames.
464463 max_latent_length = src_latents .shape [1 ]
465464 denoise_mask = torch .zeros ((1 , max_latent_length , 1 ), dtype = pipe .torch_dtype , device = pipe .device )
466465 for start , end in repainting_ranges :
467- start_frame = start * pipe .vae .sampling_rate // 1920
468- end_frame = end * pipe .vae .sampling_rate // 1920
466+ start_frame = int ( start * pipe .vae .sampling_rate / 1920 )
467+ end_frame = int ( end * pipe .vae .sampling_rate / 1920 )
469468 denoise_mask [:, start_frame :end_frame , :] = repainting_strength
470469 # set padding areas to 1.0 (full repaint) to avoid artifacts at the boundaries caused by padding
471- pad_left_frames = pad_left * pipe . vae . sampling_rate // 1920
472- pad_right_frames = pad_right * pipe . vae . sampling_rate // 1920
470+ pad_left_frames = int ( padding_frames_left / 1920 )
471+ pad_right_frames = int ( padding_frames_right / 1920 )
473472 denoise_mask [:, :pad_left_frames , :] = 1
474473 denoise_mask [:, max_latent_length - pad_right_frames :, :] = 1
475474
@@ -506,10 +505,12 @@ def process(self, pipe, duration, src_audio, audio_code_string, task_type=None,
506505 if task_type == "cover" :
507506 lm_hints_5Hz = self .tokenize (pipe .tokenizer_model .tokenizer , src_latents , pipe .silence_latent , pipe .tokenizer_model .tokenizer .pool_window_size )
508507 src_latents = pipe .tokenizer_model .detokenizer (lm_hints_5Hz )
508+ if src_latents .shape [1 ] > source_latents .shape [1 ]:
509+ source_latents = torch .cat ([source_latents , src_latents [:, source_latents .shape [1 ]:]], dim = 1 )
509510 max_latent_length = src_latents .shape [1 ]
510511 else :
511512 # use silence latents.
512- max_latent_length = int (duration * pipe .sample_rate / / 1920 )
513+ max_latent_length = round (duration * pipe .sample_rate / 1920 )
513514 src_latents = self ._get_silence_latent_slice (pipe , max_latent_length ).unsqueeze (0 )
514515 chunk_masks = torch .ones ((1 , max_latent_length , src_latents .shape [- 1 ]), dtype = torch .bool , device = pipe .device )
515516 attention_mask = torch .ones ((1 , max_latent_length ), device = src_latents .device , dtype = pipe .torch_dtype )
0 commit comments