From 0defd1da8133038644829ca1d57db178b203dd68 Mon Sep 17 00:00:00 2001
From: Sen Cao <101972681+Caxson@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:33:14 +0800
Subject: [PATCH] fix: use local token_hop_len in streaming loop to avoid
 concurrent state mutation

The streaming loop in CosyVoice2Model.tts() mutates self.token_hop_len on
each iteration (via stream_scale_factor). When multiple requests share the
same model instance, this shared state is corrupted across concurrent
inferences.

Use a local variable token_hop_len initialized from self.token_hop_len and
update only the local copy inside the loop, so each streaming session has
its own hop length progression. Behavior is unchanged for single-request
usage.
---
 cosyvoice/cli/model.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 92a15d985..0363bc9fb 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -342,10 +342,11 @@ def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.ze
         p.start()
         if stream is True:
             token_offset = 0
-            prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
+            token_hop_len = self.token_hop_len
+            prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / token_hop_len) * token_hop_len - flow_prompt_speech_token.shape[1])
             while True:
                 time.sleep(0.1)
-                this_token_hop_len = self.token_hop_len + prompt_token_pad if token_offset == 0 else self.token_hop_len
+                this_token_hop_len = token_hop_len + prompt_token_pad if token_offset == 0 else token_hop_len
                 if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= this_token_hop_len + self.flow.pre_lookahead_len:
                     this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + this_token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
@@ -357,7 +358,7 @@ def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.ze
                                                      stream=stream,
                                                      finalize=False)
                     token_offset += this_token_hop_len
-                    self.token_hop_len = min(self.token_max_hop_len, self.token_hop_len * self.stream_scale_factor)
+                    token_hop_len = min(self.token_max_hop_len, token_hop_len * self.stream_scale_factor)
                     yield {'tts_speech': this_tts_speech.cpu()}
                     if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
                         break
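
Illustrative sketch (outside the patch) of the pattern the commit describes:
reading a shared default once per call and advancing only a local copy keeps
concurrent streaming generators independent. StreamingModel and its numbers
are hypothetical stand-ins, not the actual CosyVoice2Model implementation.

    import threading

    class StreamingModel:
        def __init__(self):
            self.token_hop_len = 25        # shared default, read once per session
            self.token_max_hop_len = 200
            self.stream_scale_factor = 2

        def stream(self, n_chunks):
            # Local copy: each generator grows its own hop length, so the
            # shared self.token_hop_len is never written during inference.
            token_hop_len = self.token_hop_len
            for _ in range(n_chunks):
                yield token_hop_len
                token_hop_len = min(self.token_max_hop_len,
                                    token_hop_len * self.stream_scale_factor)

    model = StreamingModel()
    results = {}

    def run(name):
        results[name] = list(model.stream(4))

    threads = [threading.Thread(target=run, args=(f"req{i}",)) for i in range(2)]
    [t.start() for t in threads]
    [t.join() for t in threads]
    # Both sessions see the same progression (25, 50, 100, 200) regardless of
    # interleaving, because no shared state is mutated while streaming.
    print(results)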