diff --git a/bench_cosyvoice3.py b/bench_cosyvoice3.py new file mode 100644 index 000000000..1fe0006b4 --- /dev/null +++ b/bench_cosyvoice3.py @@ -0,0 +1,110 @@ +"""QPS benchmark for CosyVoice3. + +Usage: + python bench_cosyvoice3.py # vllm only + python bench_cosyvoice3.py --trt # vllm + trt + python bench_cosyvoice3.py --no-vllm # baseline (no acceleration) +""" +import sys, time, statistics, threading, queue, argparse +sys.path.append('third_party/Matcha-TTS') + +PROMPT_TEXT = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' +PROMPT_WAV = './asset/zero_shot_prompt.wav' + +TEXTS = { + 'short': '你好,今天天气真不错。', + 'medium': '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', + 'long': '在人工智能技术飞速发展的今天,语音合成已经从早期生硬的拼接方式,进化到如今能够表达丰富情感、自然流畅的神经网络模型。CosyVoice 作为阿里达摩院推出的多语言语音生成模型,在零样本音色克隆、跨语种合成、多方言支持等方面都展现出了令人惊艳的能力,为众多应用场景带来了新的可能性。', +} + + +def run_once(model, text, seed=0): + from cosyvoice.utils.common import set_all_random_seed + set_all_random_seed(seed) + t0 = time.time() + audio_sec = 0.0 + for _, j in enumerate(model.inference_zero_shot(text, PROMPT_TEXT, PROMPT_WAV, stream=False)): + audio_sec += j['tts_speech'].shape[-1] / model.sample_rate + return time.time() - t0, audio_sec + + +def bench_sequential(model, iters=5): + print('\n=== Sequential ===', flush=True) + for name, text in TEXTS.items(): + run_once(model, text, seed=99) # warmup + walls, audios = [], [] + for i in range(iters): + w, a = run_once(model, text, seed=i) + walls.append(w); audios.append(a) + avg_w = statistics.mean(walls) + avg_a = statistics.mean(audios) + print(f'{name:>7} | chars={len(text):>3} | wall={avg_w:.2f}s audio={avg_a:.2f}s RTF={avg_w/avg_a:.3f}', flush=True) + + +def bench_concurrent(model, text_name='medium', concurrencies=(1, 2, 4, 8), per_round=4): + print(f'\n=== Concurrent (text={text_name}, per_round={per_round}) ===', flush=True) + text = TEXTS[text_name] + for conc in concurrencies: + total = conc * per_round + work_q = queue.Queue() + for i in range(total): + work_q.put(i) + latencies, audios = [], [] + lock = threading.Lock() + + def worker(): + while True: + try: + seed = work_q.get_nowait() + except queue.Empty: + return + w, a = run_once(model, text, seed=seed) + with lock: + latencies.append(w); audios.append(a) + + t0 = time.time() + threads = [threading.Thread(target=worker) for _ in range(conc)] + for t in threads: t.start() + for t in threads: t.join() + wall = time.time() - t0 + + if not latencies: continue + latencies.sort() + p50 = latencies[len(latencies) // 2] + p95 = latencies[int(len(latencies) * 0.95)] + qps = total / wall + rt = sum(audios) / wall + print(f'conc={conc} n={total} | QPS={qps:.2f} audio_thru={rt:.2f}x | lat avg={statistics.mean(latencies):.2f}s p50={p50:.2f}s p95={p95:.2f}s', flush=True) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('--trt', action='store_true') + ap.add_argument('--no-vllm', action='store_true') + ap.add_argument('--concurrent-only', action='store_true') + args = ap.parse_args() + + use_vllm = not args.no_vllm + use_trt = args.trt + + if use_vllm: + from vllm import ModelRegistry + from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM + ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM) + + from cosyvoice.cli.cosyvoice import AutoModel + + print(f'Config: vllm={use_vllm} trt={use_trt}', flush=True) + print('Loading...', flush=True) + t0 = time.time() + model = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', + load_trt=use_trt, load_vllm=use_vllm, fp16=False) + 
print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + + if not args.concurrent_only: + bench_sequential(model, iters=5) + bench_concurrent(model, text_name='medium', concurrencies=(1, 2, 4, 8), per_round=4) + + +if __name__ == '__main__': + main() diff --git a/bench_push.py b/bench_push.py new file mode 100644 index 000000000..a658ce33f --- /dev/null +++ b/bench_push.py @@ -0,0 +1,22 @@ +"""Push higher concurrency + short text benchmark.""" +import sys +sys.path.append('third_party/Matcha-TTS') + +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM +ModelRegistry.register_model('CosyVoice2ForCausalLM', CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +import bench_cosyvoice3 as B + + +def main(): + m = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False) + print('===SHORT TEXT, push concurrency===', flush=True) + B.bench_concurrent(m, text_name='short', concurrencies=(4, 8, 16, 32), per_round=4) + print('===MEDIUM TEXT, push concurrency===', flush=True) + B.bench_concurrent(m, text_name='medium', concurrencies=(8, 16, 32), per_round=2) + + +if __name__ == '__main__': + main() diff --git a/cosyvoice/bin/export_hift_onnx.py b/cosyvoice/bin/export_hift_onnx.py new file mode 100644 index 000000000..b275c7a62 --- /dev/null +++ b/cosyvoice/bin/export_hift_onnx.py @@ -0,0 +1,169 @@ +# Export the conv-only path of (Causal)HiFTGenerator.decode for TRT fp16. +# +# Split point: +# PyTorch (kept): f0_predictor -> sine source -> STFT(s) +# conv_pre (causal, takes 1 or 2 args by finalize flag) +# iSTFT, finalize-truncate, audio_limit clamp +# TRT (this export): leaky_relu + ups + (reflection_pad on last) + source_downs +# + source_resblocks + resblocks (Snake act) + conv_post +# + exp/sin to magnitude/phase -- the dense GPU work +# +# Inputs to the engine: x_post_conv_pre (B, base_channels, T_x), s_stft (B, n_fft+2, T_stft) +# Outputs: magnitude (B, n_fft//2+1, T_out), phase same shape +import argparse, os, sys, random +import torch +import torch.nn as nn +import torch.nn.functional as F +import onnxruntime +from tqdm import tqdm + +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(f'{ROOT}/../..') +sys.path.append(f'{ROOT}/../../third_party/Matcha-TTS') + +from cosyvoice.cli.cosyvoice import AutoModel + + +def _strip_weight_norm(module: nn.Module): + """Remove weight_norm regardless of legacy hook or new parametrize API.""" + from torch.nn.utils import remove_weight_norm as _legacy + from torch.nn.utils.parametrize import remove_parametrizations + for m in module.modules(): + # New parametrize style (PyTorch >=2.4) + if hasattr(m, 'parametrizations') and 'weight' in getattr(m, 'parametrizations', {}): + try: + remove_parametrizations(m, 'weight', leave_parametrized=True) + continue + except Exception: + pass + # Legacy hook style + for hook in list(getattr(m, '_forward_pre_hooks', {}).values()): + if hook.__class__.__name__ == 'WeightNorm': + try: + _legacy(m, 'weight') + except Exception: + pass + break + + +class HiftDecoderConvBlock(nn.Module): + """The pure-conv post-conv_pre path of (Causal)HiFTGenerator.decode.""" + + def __init__(self, hift): + super().__init__() + self.ups = hift.ups + self.source_downs = hift.source_downs + self.source_resblocks = hift.source_resblocks + self.resblocks = hift.resblocks + self.conv_post = hift.conv_post + self.reflection_pad = hift.reflection_pad + self.lrelu_slope = hift.lrelu_slope + self.num_upsamples = hift.num_upsamples + 
self.num_kernels = hift.num_kernels + self.n_fft_half_p1 = hift.istft_params['n_fft'] // 2 + 1 + + def forward(self, x: torch.Tensor, s_stft: torch.Tensor): + for i in range(self.num_upsamples): + x = F.leaky_relu(x, self.lrelu_slope) + # ups[i] is CausalConv1dUpsample (CausalHiFTGenerator) or ConvTranspose1d. + # Both can be invoked with single arg; default empty cache hits zero-pad path. + x = self.ups[i](x) + if i == self.num_upsamples - 1: + x = self.reflection_pad(x) + si = self.source_downs[i](s_stft) + si = self.source_resblocks[i](si) + x = x + si + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs = xs + self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + magnitude = torch.exp(x[:, :self.n_fft_half_p1, :]) + phase = torch.sin(x[:, self.n_fft_half_p1:, :]) + return magnitude, phase + + +def _probe_shapes(hift, device): + # Build a dummy input by running the PyTorch path and snapshotting tensors at split points. + # T_x = mel chunk length post conv_pre (causal pad shrinks input by causal_padding). + # Use a representative chunk size: 25 tokens * 2 mel_ratio = 50 mel frames; conv_pre w/ pad=3 keeps T. + dummy_mel = torch.randn(1, 80, 80, device=device, dtype=torch.float32) + # f0 -> source -> STFT path mirrors CausalHiFTGenerator.inference (needs float64 f0 predictor) + hift.f0_predictor.to(torch.float64) + f0 = hift.f0_predictor(dummy_mel.to(torch.float64), finalize=True).to(dummy_mel) + s = hift.f0_upsamp(f0[:, None]).transpose(1, 2) + s, _, _ = hift.m_source(s) + s = s.transpose(1, 2) + # decode() preamble: + x = hift.conv_pre(dummy_mel) + s_real, s_imag = hift._stft(s.squeeze(1)) + s_stft = torch.cat([s_real, s_imag], dim=1) + return x, s_stft + + +@torch.no_grad() +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', default='pretrained_models/Fun-CosyVoice3-0.5B') + args = parser.parse_args() + + print(f'[export] loading {args.model_dir} ...', flush=True) + auto = AutoModel(model_dir=args.model_dir, load_trt=False, load_vllm=False, fp16=False) + hift = auto.model.hift + device = next(hift.parameters()).device + + print('[export] removing weight_norm on hift (new+legacy APIs) ...', flush=True) + _strip_weight_norm(hift) + hift.eval() + + block = HiftDecoderConvBlock(hift).eval().to(device) + + print('[export] probing tensor shapes via PyTorch fwd ...', flush=True) + x_dummy, s_stft_dummy = _probe_shapes(hift, device) + print(f' x={tuple(x_dummy.shape)} s_stft={tuple(s_stft_dummy.shape)}', flush=True) + + onnx_path = os.path.join(args.model_dir, 'hift.decoder.fp32.onnx') + print(f'[export] torch.onnx.export -> {onnx_path}', flush=True) + torch.onnx.export( + block, + (x_dummy, s_stft_dummy), + onnx_path, + export_params=True, + opset_version=18, + do_constant_folding=True, + input_names=['x', 's_stft'], + output_names=['magnitude', 'phase'], + dynamic_axes={ + 'x': {2: 'T_x'}, + 's_stft': {2: 'T_stft'}, + 'magnitude': {2: 'T_out'}, + 'phase': {2: 'T_out'}, + }, + ) + + # Sanity check: run via onnxruntime and compare to PyTorch. 
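+ # (The check below only validates the ONNX graph through onnxruntime; the TRT
+ # engine itself is built lazily by load_trt_hift in cosyvoice/cli/model.py, or it
+ # can be pre-built offline. A hedged sketch with trtexec -- shapes follow the
+ # runtime profile, the output filename is illustrative:
+ #   trtexec --onnx=hift.decoder.fp32.onnx \
+ #     --minShapes=x:1x512x10,s_stft:1x18x1201 \
+ #     --optShapes=x:1x512x80,s_stft:1x18x9601 \
+ #     --maxShapes=x:1x512x600,s_stft:1x18x72001 \
+ #     --fp16 --saveEngine=hift.decoder.fp16.mygpu.plan )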
+ print('[export] sanity check via onnxruntime CUDA EP ...', flush=True) + sess = onnxruntime.InferenceSession( + onnx_path, + providers=['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider'], + ) + # Sanity-check on the actual probed shapes (the only ones for which we know + # the exact T_stft / T_x relationship; the source-downs Conv1d ratios make + # arbitrary T_x impossible to test with random stub tensors). + out_pt = block(x_dummy, s_stft_dummy) + out_ort = sess.run(None, {'x': x_dummy.cpu().numpy(), 's_stft': s_stft_dummy.cpu().numpy()}) + for name, pt, ort in zip(['magnitude', 'phase'], out_pt, out_ort): + ort_t = torch.from_numpy(ort).to(device) + diff = (pt - ort_t).abs() + print(f' ort vs torch {name}: max_abs={diff.max().item():.3e} mean_abs={diff.mean().item():.3e} ' + f'shape={tuple(ort_t.shape)}') + + print(f'[export] done. ONNX size = {os.path.getsize(onnx_path) / 1e6:.1f} MB', flush=True) + + +if __name__ == '__main__': + main() diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index 7ab04a70f..dec8df4ae 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -188,7 +188,8 @@ def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk class CosyVoice3(CosyVoice2): - def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1): + def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, + trt_concurrent=int(os.environ.get('FLOW_TRT_CONCURRENT', '4'))): self.model_dir = model_dir self.fp16 = fp16 if not os.path.exists(model_dir): @@ -222,6 +223,23 @@ def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_c '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir), trt_concurrent, self.fp16) + # HiFi-GAN decoder (post conv_pre) -> TRT, opt-in via env LOAD_TRT_HIFT=1. + # As of Round 13, hift TRT is fp16 by default. The Snake activation + # in cosyvoice/transformer/activation.py was patched to clamp + # inv_alpha at the source (max=6e4), which fixes the fp16 overflow + # that previously saturated audio (Round 6 regression). Pure fp16 + # is now safe AND fastest. Set HIFT_TRT_FP16=0 to revert to fp32. + if os.environ.get('LOAD_TRT_HIFT', '0') == '1': + hift_fp16 = os.environ.get('HIFT_TRT_FP16', '1') == '1' + hift_onnx = '{}/hift.decoder.fp32.onnx'.format(model_dir) + hift_engine = '{}/hift.decoder.{}.mygpu.plan'.format( + model_dir, 'fp16' if hift_fp16 else 'fp32') + if os.path.exists(hift_onnx): + self.model.load_trt_hift(hift_engine, hift_onnx, hift_fp16) + logging.info('hift TRT engine loaded ({}); decode patched'.format( + 'fp16' if hift_fp16 else 'fp32')) + else: + logging.warning('LOAD_TRT_HIFT=1 but {} not found; skipping'.format(hift_onnx)) del configs diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 92a15d985..294cc9db9 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -98,6 +98,89 @@ def get_trt_kwargs(self): input_names = ["x", "mask", "mu", "cond"] return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names} + def load_trt_hift(self, hift_engine_path, hift_onnx_path, fp16): + """Load (or build) the TRT engine for HiFi-GAN's conv-only decoder + path and monkey-patch hift.decode to use it. Engine takes + (x_post_conv_pre, s_stft) and returns (magnitude, phase).""" + import numpy as np + assert torch.cuda.is_available(), 'tensorrt only supports gpu!' 
+ if not os.path.exists(hift_engine_path) or os.path.getsize(hift_engine_path) == 0: + # Internal Add op enforces T_stft = 120 * T_x + 1 exactly. + # min=10 covers finalize=False truncated chunks; max=600 covers full utterance. + trt_kwargs = { + 'min_shape': [(1, 512, 10), (1, 18, 1201)], + 'opt_shape': [(1, 512, 80), (1, 18, 9601)], + 'max_shape': [(1, 512, 600), (1, 18, 72001)], + 'input_names': ['x', 's_stft'], + } + # Snake activation in cosyvoice/transformer/activation.py now + # clamps inv_alpha at the source (max=6e4, fp16-safe) so the + # 4/10752 outlier alphas no longer trigger overflow. This makes + # pure fp16 hift engine viable WITHOUT OBEY_PRECISION_CONSTRAINTS. + # Set HIFT_TRT_FP32_KW=1 to force the per-layer fp32 fallback. + extra = {'fp32_layer_keywords': ['activations']} \ + if fp16 and os.environ.get('HIFT_TRT_FP32_KW', '0') == '1' else {} + convert_onnx_to_trt(hift_engine_path, trt_kwargs, hift_onnx_path, fp16, **extra) + import tensorrt as trt + import queue as _queue + with open(hift_engine_path, 'rb') as f: + engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read()) + assert engine is not None, 'failed to load hift trt {}'.format(hift_engine_path) + self._hift_trt_engine = engine + self._hift_trt_dtype = torch.float16 if fp16 else torch.float32 + + # Single context + lock. Tried: (a) Flow-style multi-ctx with dedicated + # streams (per-call sync killed perf, 3x worse); (b) multi-ctx with + # shared current_stream (slower than single-ctx, likely TRT-internal + # concurrent-context contention). Single-ctx + lock is the only stable + # variant; lock contention is small because TRT exec is async. + self._hift_trt_context = engine.create_execution_context() + self._hift_trt_lock = threading.Lock() + + hift = self.hift + n_fft_half = hift.istft_params['n_fft'] // 2 + 1 + upsample_prod = int(np.prod(hift.upsample_rates)) + hop_len = hift.istft_params['hop_len'] + engine_dtype = self._hift_trt_dtype + + def trt_decode(x, s=torch.zeros(1, 1, 0), finalize=True): + # Mirror CausalHiFTGenerator.decode preamble in PyTorch. 
+ s_stft_real, s_stft_imag = hift._stft(s.squeeze(1)) + if finalize is True: + x_post = hift.conv_pre(x) + else: + x_post = hift.conv_pre(x[:, :, :-hift.conv_pre_look_right], x[:, :, -hift.conv_pre_look_right:]) + s_stft_real = s_stft_real[:, :, :-upsample_prod * hift.conv_pre_look_right] + s_stft_imag = s_stft_imag[:, :, :-upsample_prod * hift.conv_pre_look_right] + s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) + + # === TRT engine (single context + lock) === + x_in = x_post.to(engine_dtype).contiguous() + s_in = s_stft.to(engine_dtype).contiguous() + with self._hift_trt_lock: + ctx = self._hift_trt_context + ctx.set_input_shape('x', tuple(x_in.shape)) + ctx.set_input_shape('s_stft', tuple(s_in.shape)) + T_out = ctx.get_tensor_shape('magnitude')[2] + magnitude = torch.empty(x_in.shape[0], n_fft_half, T_out, device=x.device, dtype=engine_dtype) + phase = torch.empty_like(magnitude) + ctx.set_tensor_address('x', x_in.data_ptr()) + ctx.set_tensor_address('s_stft', s_in.data_ptr()) + ctx.set_tensor_address('magnitude', magnitude.data_ptr()) + ctx.set_tensor_address('phase', phase.data_ptr()) + assert ctx.execute_async_v3(torch.cuda.current_stream().cuda_stream), 'hift trt exec failed' + magnitude_f32 = magnitude.float() + phase_f32 = phase.float() + # === end TRT === + + audio = hift._istft(magnitude_f32, phase_f32) + if finalize is False: + audio = audio[:, :-upsample_prod * hop_len] + audio = torch.clamp(audio, -hift.audio_limit, hift.audio_limit) + return audio + + hift.decode = trt_decode + def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): cur_silent_token_num, max_silent_token_num = 0, 5 with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False): @@ -284,7 +367,10 @@ def load_vllm(self, model_dir): engine_args = EngineArgs(model=model_dir, skip_tokenizer_init=True, enable_prompt_embeds=True, - gpu_memory_utilization=0.2) + gpu_memory_utilization=float(os.environ.get('VLLM_GPU_UTIL', '0.6')), + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_num_seqs=int(os.environ.get('VLLM_MAX_SEQS', '64'))) self.llm.vllm = LLMEngine.from_engine_args(engine_args) self.llm.lock = threading.Lock() del self.llm.llm.model.model.layers diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index e8e81d942..02c99f21b 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -501,37 +501,65 @@ def inference( for token in self.inference_wrapper(lm_input, sampling, min_len, max_len, uuid): yield token + def _ensure_vllm_scheduler(self): + # Single dedicated thread owns vllm.step(); client threads block on + # their own per-uuid Queue. This collapses the per-client polling + # contention that capped concurrent QPS. 
+ if getattr(self, '_vllm_scheduler_started', False): + return + with self.lock: + if getattr(self, '_vllm_scheduler_started', False): + return + self._vllm_scheduler_started = True + t = threading.Thread(target=self._vllm_scheduler_loop, daemon=True, + name='vllm-scheduler') + t.start() + + def _vllm_scheduler_loop(self): + while True: + try: + if not self.vllm.has_unfinished_requests(): + time.sleep(0.001) + continue + request_outputs = self.vllm.step() + for request_output in request_outputs: + top_ids = list(request_output.outputs[0].token_ids)[-1] + q = self.vllm_output_queue.get(request_output.request_id) + if q is not None: + q.put(top_ids) + except Exception as e: + # Surface but keep the scheduler alive so other reqs survive + print(f'[vllm-scheduler] {type(e).__name__}: {e}', flush=True) + time.sleep(0.01) + @torch.inference_mode() def inference_wrapper(self, lm_input, sampling, min_len, max_len, uuid): if hasattr(self, 'vllm'): - from vllm import SamplingParams, RequestOutput + from vllm import SamplingParams sampling_params = SamplingParams(top_k=sampling, stop_token_ids=self.stop_token_ids, min_tokens=min_len, max_tokens=max_len) + self._ensure_vllm_scheduler() + # Register the queue BEFORE add_request so the scheduler never + # has to drop a token because the dict isn't ready yet. + q = queue.Queue() with self.lock: + self.vllm_output_queue[uuid] = q self.vllm.add_request(uuid, {"prompt_embeds": lm_input.squeeze(0).to(torch.bfloat16).to(lm_input.device)}, sampling_params) - self.vllm_output_queue[uuid] = queue.Queue() out_tokens = [] - while True: - with self.lock: - if self.vllm_output_queue[uuid].empty() is True: - request_outputs: List[RequestOutput] = self.vllm.step() - for request_output in request_outputs: - top_ids = list(request_output.outputs[0].token_ids)[-1] - self.vllm_output_queue[request_output.request_id].put(top_ids) - if self.vllm_output_queue[uuid].empty() is False: - top_ids = self.vllm_output_queue[uuid].get() + try: + while True: + top_ids = q.get(timeout=120) # blocks; safety timeout if top_ids in self.stop_token_ids: break - # in stream mode, yield token one by one yield top_ids out_tokens.append(top_ids) if len(out_tokens) == max_len: break - time.sleep(0.001) - with self.lock: - self.vllm_output_queue.pop(uuid) + finally: + with self.lock: + self.vllm_output_queue.pop(uuid, None) else: out_tokens = [] cache = None diff --git a/cosyvoice/transformer/activation.py b/cosyvoice/transformer/activation.py index 8cea54816..394ba1271 100644 --- a/cosyvoice/transformer/activation.py +++ b/cosyvoice/transformer/activation.py @@ -75,10 +75,20 @@ def forward(self, x): Forward pass of the function. Applies the function to the input elementwise. Snake ∶= x + 1/a * sin^2 (xa) + + FP16-safety note: 4/10752 trained alpha channels in CosyVoice3 hift + have alpha < 1.6e-5, which sends 1/alpha past fp16 max=65504 and + saturates the entire downstream waveform when the engine is fp16 + (without per-layer precision constraints). Clamping inv_alpha to + a fp16-safe ceiling fixes this with negligible math change for the + 99.96% of channels with normal alpha values. 
''' alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] if self.alpha_logscale: alpha = torch.exp(alpha) - x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + inv_alpha = 1.0 / (alpha + self.no_div_by_zero) + # 6e4 is just under fp16 max; affects only the few outlier channels + inv_alpha = torch.clamp(inv_alpha, max=6e4) + x = x + inv_alpha * pow(sin(x * alpha), 2) return x diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py index b173ef201..902df810c 100644 --- a/cosyvoice/utils/file_utils.py +++ b/cosyvoice/utils/file_utils.py @@ -50,7 +50,14 @@ def load_wav(wav, target_sr, min_sr=16000): return speech -def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16): +def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16, fp32_layer_keywords=None): + """ + fp32_layer_keywords: optional iterable of substrings; any TRT layer whose + name OR op type contains one of these (case-insensitive) is forced to + run in fp32 even when the engine is built in fp16. Used to protect + numerically sensitive ops (e.g., Snake activation: 1/alpha * sin(alpha*x)^2 + overflows fp16 when alpha is small). + """ import tensorrt as trt logging.info("Converting onnx to trt...") network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) @@ -80,6 +87,21 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16): for i in range(network.num_outputs): output_tensor = network.get_output(i) output_tensor.dtype = tensor_dtype + # Per-layer fp32 overrides for numerically sensitive ops. + if fp16 and fp32_layer_keywords: + config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS) + keys = [k.lower() for k in fp32_layer_keywords] + forced = 0 + for li in range(network.num_layers): + layer = network.get_layer(li) + sig = (layer.name + ' ' + str(layer.type)).lower() + if any(k in sig for k in keys): + layer.precision = trt.DataType.FLOAT + for j in range(layer.num_outputs): + layer.set_output_type(j, trt.DataType.FLOAT) + forced += 1 + logging.info("forced %d/%d layers to fp32 (keywords=%s)", + forced, network.num_layers, list(fp32_layer_keywords)) config.add_optimization_profile(profile) engine_bytes = builder.build_serialized_network(network, config) # save trt engine diff --git a/dump_onnx_nodes.py b/dump_onnx_nodes.py new file mode 100644 index 000000000..15bfead50 --- /dev/null +++ b/dump_onnx_nodes.py @@ -0,0 +1,44 @@ +"""Dump ONNX node names + op types, focus on Snake-related ops, to find a +narrower keyword set than 'sin,pow,reciprocal,div' for the TRT fp32 override.""" +import sys, onnx +from collections import Counter + +p = sys.argv[1] if len(sys.argv) > 1 else 'pretrained_models/Fun-CosyVoice3-0.5B/hift.decoder.fp32.onnx' +m = onnx.load(p) +g = m.graph + +print(f'graph: {len(g.node)} nodes') +op_counts = Counter(n.op_type for n in g.node) +print('\nop type counts (top 20):') +for op, c in op_counts.most_common(20): + print(f' {op:>20} : {c}') + +# Snake decomposes to: Mul (alpha*x) -> Sin -> Pow(2) -> Add(alpha+eps) -> Reciprocal -> Mul -> Add(x+...) 
+# Look at node names containing 'Snake' or 'activation' (PyTorch module names) +print('\nnodes with "Snake" or "activation" in name (first 40):') +relevant = [n for n in g.node if 'snake' in n.name.lower() or 'activation' in n.name.lower()] +for n in relevant[:40]: + inputs = [i for i in n.input if not i.startswith('onnx::')][:2] + print(f' {n.op_type:>15} {n.name} inputs={inputs}') + +# Show a sample of Reciprocal nodes +print('\nall Reciprocal nodes:') +for n in g.node: + if n.op_type == 'Reciprocal': + print(f' {n.name} input={list(n.input)[:1]}') + +# All ops with "alpha" related inputs (initializers contain 'alpha') +alpha_initializers = {init.name for init in g.initializer if 'alpha' in init.name.lower()} +print(f'\n{len(alpha_initializers)} initializers with "alpha" in name (first 5):') +for n in list(alpha_initializers)[:5]: + print(f' {n}') + +# Find ops whose inputs reference an alpha initializer (these are the Snake math ops) +print('\nfirst 5 nodes whose input references alpha:') +alpha_consumers = [] +for n in g.node: + if any(i in alpha_initializers for i in n.input): + alpha_consumers.append(n) +for n in alpha_consumers[:5]: + print(f' {n.op_type:>15} {n.name}') +print(f'... total alpha consumers: {len(alpha_consumers)}') diff --git a/dump_snake_alphas.py b/dump_snake_alphas.py new file mode 100644 index 000000000..3c30975e7 --- /dev/null +++ b/dump_snake_alphas.py @@ -0,0 +1,30 @@ +"""Dump trained Snake alpha values from hift.pt to see if 1/alpha actually +overflows fp16 (max 65504). If all alphas >= 1.5e-5, fp16 overflow can't be +the bug, and R6's saturated audio has a different cause.""" +import sys, torch +import numpy as np + +ckpt = sys.argv[1] if len(sys.argv) > 1 else 'pretrained_models/Fun-CosyVoice3-0.5B/hift.pt' +sd = torch.load(ckpt, map_location='cpu', weights_only=True) +print(f'loaded {ckpt}: {len(sd)} keys') + +alpha_keys = [k for k in sd.keys() if k.endswith('.alpha')] +print(f'\nfound {len(alpha_keys)} Snake alpha tensors\n') + +all_vals = [] +problem_count = 0 +for k in alpha_keys[:10]: # sample first 10 + a = sd[k].abs() + inv_a_max = (1.0 / (a + 1e-9)).max().item() + inv_a_min = (1.0 / (a + 1e-9)).min().item() + print(f' {k:60s} shape={tuple(a.shape)} alpha [min,max]=[{a.min().item():.4e}, {a.max().item():.4e}] 1/alpha max={inv_a_max:.2e}') + all_vals.append(a.flatten()) + +all_vals = torch.cat([sd[k].abs().flatten() for k in alpha_keys]) +inv_all = 1.0 / (all_vals + 1e-9) +print(f'\n=== overall stats over {len(all_vals)} alpha values ===') +print(f' alpha min={all_vals.min().item():.4e} max={all_vals.max().item():.4e} mean={all_vals.mean().item():.4e}') +print(f' 1/alpha max={inv_all.max().item():.4e} fp16 max=65504') +unsafe = (inv_all > 65504).sum().item() +print(f' values where 1/alpha > 65504 (fp16 overflow): {unsafe} / {len(all_vals)}') +print(f' values where 1/alpha > 6500 (close to limit): {(inv_all > 6500).sum().item()}') diff --git a/eval/quality_eval.py b/eval/quality_eval.py new file mode 100644 index 000000000..b539d480f --- /dev/null +++ b/eval/quality_eval.py @@ -0,0 +1,245 @@ +"""Automated audio quality evaluation across CosyVoice optimization rounds. + +Computes the following metrics on each WAV in samples/round*/: + + Whisper WER -- intelligibility regression detector (catches fp16 NaN + pronouncing wrong, quantization artifacts collapsing + phonemes). Compares Whisper transcript to the reference + text the sample was generated from. + SECS -- speaker similarity to the prompt audio + (asset/zero_shot_prompt.wav). Uses ECAPA-TDNN. 
Cosine + similarity in [-1, 1]; >=0.90 is "kept the voice", + <0.85 = regression. + RMS energy -- gross sanity (zeroed-out / clipping detection). + Duration -- catches truncation regressions. + +Usage +----- + python quality_eval.py --samples-root samples --reference-prompt asset/zero_shot_prompt.wav + +Optional metrics (pass --with-dnsmos): DNSMOS perceptual quality (1-5). +Slow on first run (downloads weights). Skipped by default. + +Dependencies (install in venv): + pip install openai-whisper speechbrain torchaudio + # optional: pip install dnsmos +""" +import argparse +import json +import os +import sys +from pathlib import Path + +import torch +import torchaudio +from torch.nn.functional import cosine_similarity + +# Map sample filename prefix -> reference text the sample was generated from. +# Matches the prompts used in samples/round*/ generation (curl loops in commits). +REFERENCE_TEXTS = { + '你好欢迎': '你好欢迎', + '阿里云Cos': '阿里云CosyVoice三号是当前开源里最先进的多语言语音合成系统之一', + 'long': '昨天我去图书馆借了三本关于人工智能的书,发现现代深度学习模型的发展速度真的非常惊人。' + '短短几年时间,从GPT-2到GPT-4,再到现在的多模态大模型,每一代都有质的飞跃。' + '我相信未来十年内,人工智能将会彻底改变我们的工作和生活方式。', +} + + +def find_reference_text(filename: str) -> str | None: + """Match a wav filename to the text it was generated from.""" + stem = Path(filename).stem + for prefix, text in REFERENCE_TEXTS.items(): + if stem.startswith(prefix): + return text + return None + + +def load_wav(path: Path, target_sr: int): + # Use soundfile to dodge torchaudio>=2.11's torchcodec dependency. + import soundfile as sf + data, sr = sf.read(str(path), always_2d=True) # (T, C) float64 + wav = torch.from_numpy(data.T).float() # (C, T) + if wav.shape[0] > 1: + wav = wav.mean(dim=0, keepdim=True) + if sr != target_sr: + wav = torchaudio.functional.resample(wav, sr, target_sr) + return wav, target_sr + + +def normalize_text(s: str) -> str: + """Strip whitespace + punctuation + lowercase -- crude CER pre-processing + so Whisper transcripts compare fairly against the prompts.""" + import re + s = re.sub(r'[,。!?、;:""''""()()【】《》\s\.,!?;:\'\"\-\[\]<>]', '', s) + return s.lower().strip() + + +def cer(ref: str, hyp: str) -> float: + """Character Error Rate via Levenshtein distance.""" + r = list(normalize_text(ref)) + h = list(normalize_text(hyp)) + if not r: + return 0.0 if not h else 1.0 + # DP edit distance + n, m = len(r), len(h) + dp = [[0] * (m + 1) for _ in range(n + 1)] + for i in range(n + 1): + dp[i][0] = i + for j in range(m + 1): + dp[0][j] = j + for i in range(1, n + 1): + for j in range(1, m + 1): + cost = 0 if r[i - 1] == h[j - 1] else 1 + dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost) + return dp[n][m] / n + + +def init_whisper(device: str): + import whisper + print(f'[whisper] loading "base" model on {device} ...', flush=True) + return whisper.load_model('base', device=device) + + +def init_secs(device: str): + from speechbrain.inference.speaker import EncoderClassifier + print(f'[secs] loading speechbrain ECAPA-TDNN on {device} ...', flush=True) + return EncoderClassifier.from_hparams( + source='speechbrain/spkrec-ecapa-voxceleb', + run_opts={'device': device}, + savedir='/tmp/spkrec-ecapa-voxceleb', + ) + + +def whisper_transcribe(model, wav_path: Path) -> str: + # Whisper handles its own resampling; pass the file path + result = model.transcribe(str(wav_path), language='zh', fp16=torch.cuda.is_available()) + return result['text'] + + +def secs_against_ref(secs_model, sample_path: Path, ref_emb): + wav, _ = load_wav(sample_path, 16000) + emb = secs_model.encode_batch(wav) + 
return cosine_similarity(ref_emb.flatten().unsqueeze(0), emb.flatten().unsqueeze(0)).item() + + +def rms_db(wav_path: Path) -> float: + wav, _ = load_wav(wav_path, 16000) + rms = wav.pow(2).mean().sqrt().item() + return 20 * torch.log10(torch.tensor(max(rms, 1e-10))).item() + + +def duration_s(wav_path: Path) -> float: + import soundfile as sf + info = sf.info(str(wav_path)) + return info.frames / info.samplerate + + +def evaluate_round(round_dir: Path, whisper_model, secs_model, ref_emb, with_dnsmos: bool): + """Return dict with per-sample scores + per-round aggregates.""" + samples = sorted(round_dir.glob('*.wav')) + rows = [] + for w in samples: + ref_text = find_reference_text(w.name) + cer_score = None + transcript = '' + if ref_text and whisper_model is not None: + try: + transcript = whisper_transcribe(whisper_model, w) + cer_score = cer(ref_text, transcript) + except Exception as e: + print(f' [whisper-fail] {w.name}: {e}', flush=True) + secs_score = None + if secs_model is not None and ref_emb is not None: + try: + secs_score = secs_against_ref(secs_model, w, ref_emb) + except Exception as e: + print(f' [secs-fail] {w.name}: {e}', flush=True) + rows.append({ + 'file': w.name, + 'duration_s': round(duration_s(w), 3), + 'rms_db': round(rms_db(w), 2), + 'whisper_cer': round(cer_score, 4) if cer_score is not None else None, + 'whisper_text': transcript, + 'secs': round(secs_score, 4) if secs_score is not None else None, + }) + if not rows: + return None + + def _avg(key): + vs = [r[key] for r in rows if r[key] is not None] + return round(sum(vs) / len(vs), 4) if vs else None + + return { + 'round': round_dir.name, + 'n_samples': len(rows), + 'avg_cer': _avg('whisper_cer'), + 'avg_secs': _avg('secs'), + 'avg_rms_db': _avg('rms_db'), + 'avg_dur_s': _avg('duration_s'), + 'samples': rows, + } + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('--samples-root', default='samples') + ap.add_argument('--reference-prompt', default='asset/zero_shot_prompt.wav') + ap.add_argument('--out-json', default='eval/quality_report.json') + ap.add_argument('--out-md', default='eval/quality_report.md') + ap.add_argument('--device', default='cuda' if torch.cuda.is_available() else 'cpu') + ap.add_argument('--skip-whisper', action='store_true') + ap.add_argument('--skip-secs', action='store_true') + ap.add_argument('--with-dnsmos', action='store_true', help='(not yet wired)') + args = ap.parse_args() + + samples_root = Path(args.samples_root) + rounds = sorted(p for p in samples_root.iterdir() if p.is_dir()) + if not rounds: + print(f'no rounds found under {samples_root}', file=sys.stderr) + sys.exit(1) + print(f'found {len(rounds)} rounds: {[r.name for r in rounds]}', flush=True) + + whisper_model = None if args.skip_whisper else init_whisper(args.device) + secs_model = None + ref_emb = None + if not args.skip_secs: + secs_model = init_secs(args.device) + ref_wav, _ = load_wav(Path(args.reference_prompt), 16000) + ref_emb = secs_model.encode_batch(ref_wav) + print(f'[secs] reference embedding ready (shape={tuple(ref_emb.shape)})', flush=True) + + results = [] + for r in rounds: + print(f'\n=== {r.name} ===', flush=True) + out = evaluate_round(r, whisper_model, secs_model, ref_emb, args.with_dnsmos) + if out: + results.append(out) + print(f' avg CER={out["avg_cer"]} SECS={out["avg_secs"]} RMS_dB={out["avg_rms_db"]}', flush=True) + + Path(args.out_json).parent.mkdir(parents=True, exist_ok=True) + with open(args.out_json, 'w', encoding='utf-8') as f: + json.dump(results, f, ensure_ascii=False, 
indent=2) + print(f'\nwrote {args.out_json}', flush=True) + + # Markdown summary + lines = ['# Audio quality across optimization rounds', '', + '| round | n | avg CER | avg SECS | avg RMS dB | avg dur s | flag |', + '|---|---:|---:|---:|---:|---:|---|'] + base = results[0] + for row in results: + flag = '' + if base['avg_cer'] is not None and row['avg_cer'] is not None: + if row['avg_cer'] > base['avg_cer'] + 0.05: + flag += ' INTELLIGIBILITY ' + if base['avg_secs'] is not None and row['avg_secs'] is not None: + if row['avg_secs'] < base['avg_secs'] - 0.05: + flag += ' VOICE ' + lines.append(f'| {row["round"]} | {row["n_samples"]} | {row["avg_cer"]} | ' + f'{row["avg_secs"]} | {row["avg_rms_db"]} | {row["avg_dur_s"]} | {flag.strip() or "-"} |') + with open(args.out_md, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines) + '\n') + print(f'wrote {args.out_md}', flush=True) + + +if __name__ == '__main__': + main() diff --git a/eval/quality_report.json b/eval/quality_report.json new file mode 100644 index 000000000..dd0d9f977 --- /dev/null +++ b/eval/quality_report.json @@ -0,0 +1,472 @@ +[ + { + "round": "round0_baseline", + "n_samples": 4, + "avg_cer": 0.2536, + "avg_secs": 0.6065, + "avg_rms_db": -21.645, + "avg_dur_s": 4.33, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.6, + "rms_db": -22.44, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.4029 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.52, + "rms_db": -20.94, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.6009 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 7.04, + "rms_db": -20.85, + "whisper_cer": 0.3143, + "whisper_text": "阿里云 科斯科尔斯3号是当前开员里最先进的多语言语音合成系统之一", + "secs": 0.6824 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 7.16, + "rms_db": -22.35, + "whisper_cer": 0.2, + "whisper_text": "阿里云Cosey Voice 3号使当前开原理最先进的多语言语音和程系统之一", + "secs": 0.7398 + } + ] + }, + { + "round": "round10_hift_fp32", + "n_samples": 4, + "avg_cer": 0.2339, + "avg_secs": 0.6153, + "avg_rms_db": -20.29, + "avg_dur_s": 4.11, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.12, + "rms_db": -20.98, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.469 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.96, + "rms_db": -17.03, + "whisper_cer": 0.0, + "whisper_text": "你好 欢迎", + "secs": 0.635 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.8, + "rms_db": -22.07, + "whisper_cer": 0.3143, + "whisper_text": "阿里云Cosey Voice 3號是當前開員裡最先進的多餘言語音合成系統之一", + "secs": 0.7085 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.56, + "rms_db": -21.08, + "whisper_cer": 0.3714, + "whisper_text": "阿里云CoseyBoss3号是当前开原理最先进的多余言云和成细统之一", + "secs": 0.6487 + } + ] + }, + { + "round": "round11_hift_fp16_snake32", + "n_samples": 4, + "avg_cer": 0.2339, + "avg_secs": 0.6139, + "avg_rms_db": -20.23, + "avg_dur_s": 4.11, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.12, + "rms_db": -20.92, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.4666 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.96, + "rms_db": -16.99, + "whisper_cer": 0.0, + "whisper_text": "你好 欢迎", + "secs": 0.6314 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.8, + "rms_db": -22.0, + "whisper_cer": 0.3143, + "whisper_text": "阿里云Cosey Voice 3號是當前開員裡最先進的多餘言語音合成系統之一", + "secs": 0.7084 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.56, + "rms_db": -21.01, + "whisper_cer": 0.3714, + "whisper_text": "阿里云CoseyBoss3号是当前开原理最先进的多余言云和成细统之一", + "secs": 0.649 + } + ] + }, + { + "round": 
"round12_hift_fp16_precise", + "n_samples": 4, + "avg_cer": 0.2339, + "avg_secs": 0.6148, + "avg_rms_db": -20.23, + "avg_dur_s": 4.11, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.12, + "rms_db": -20.92, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.469 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.96, + "rms_db": -16.99, + "whisper_cer": 0.0, + "whisper_text": "你好 欢迎", + "secs": 0.6331 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.8, + "rms_db": -22.0, + "whisper_cer": 0.3143, + "whisper_text": "阿里云Cosey Voice 3號是當前開員裡最先進的多餘言語音合成系統之一", + "secs": 0.7075 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.56, + "rms_db": -21.01, + "whisper_cer": 0.3714, + "whisper_text": "阿里云CoseyBoss3号是当前开原理最先进的多余言云和成细统之一", + "secs": 0.6497 + } + ] + }, + { + "round": "round13_hift_fp16_clamped", + "n_samples": 4, + "avg_cer": 0.2339, + "avg_secs": 0.6148, + "avg_rms_db": -20.23, + "avg_dur_s": 4.11, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.12, + "rms_db": -20.92, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.469 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.96, + "rms_db": -16.99, + "whisper_cer": 0.0, + "whisper_text": "你好 欢迎", + "secs": 0.6331 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.8, + "rms_db": -22.0, + "whisper_cer": 0.3143, + "whisper_text": "阿里云Cosey Voice 3號是當前開員裡最先進的多餘言語音合成系統之一", + "secs": 0.7075 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.56, + "rms_db": -21.01, + "whisper_cer": 0.3714, + "whisper_text": "阿里云CoseyBoss3号是当前开原理最先进的多余言云和成细统之一", + "secs": 0.6497 + } + ] + }, + { + "round": "round1_fp16", + "n_samples": 5, + "avg_cer": 0.1835, + "avg_secs": 0.6719, + "avg_rms_db": -20.026, + "avg_dur_s": 7.744, + "samples": [ + { + "file": "long_s0.wav", + "duration_s": 22.36, + "rms_db": -21.37, + "whisper_cer": 0.0673, + "whisper_text": "昨天我去图书馆借了三本关于人工智能的书发现现代深度学习模型的发展速度真的非常清人难单几年时间从GPT-2到GPT-4再到现在的多摩太大模型每一代都有智的飞越我相信未来十年内人工智能将会彻底改变我们的工作和生活方式", + "secs": 0.7316 + }, + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.6, + "rms_db": -18.28, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.534 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 2.12, + "rms_db": -16.61, + "whisper_cer": 0.0, + "whisper_text": "你好,欢迎", + "secs": 0.5721 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.28, + "rms_db": -22.24, + "whisper_cer": 0.2286, + "whisper_text": "阿里云Cosey Voice 3号是当前开原理最先进的多语言语音和成细统G", + "secs": 0.7317 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.36, + "rms_db": -21.63, + "whisper_cer": 0.3714, + "whisper_text": "阿里云 科斯科斯3号是当前开员里最先进的多语言语音和诚系统之一", + "secs": 0.7901 + } + ] + }, + { + "round": "round2_vllm", + "n_samples": 4, + "avg_cer": 0.2143, + "avg_secs": 0.6764, + "avg_rms_db": -21.1125, + "avg_dur_s": 4.12, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.52, + "rms_db": -19.59, + "whisper_cer": 0.0, + "whisper_text": "你好,欢迎", + "secs": 0.5814 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.68, + "rms_db": -20.69, + "whisper_cer": 0.0, + "whisper_text": "你好,欢迎", + "secs": 0.589 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.84, + "rms_db": -21.41, + "whisper_cer": 0.4286, + "whisper_text": "阿里文 Cosey Voice 3號時當前開員裡最先進達多餘言語音和成系統之一", + "secs": 0.7435 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.44, + "rms_db": -22.76, + "whisper_cer": 0.4286, + "whisper_text": "阿里云 高CBOS3号时当前开原理最先进的多语言与音和成细统之一", + "secs": 0.7916 + } + ] + }, + { + "round": "round3_lockfree", + "n_samples": 4, + "avg_cer": 0.2697, + "avg_secs": 0.6617, + 
"avg_rms_db": -20.415, + "avg_dur_s": 4.19, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.68, + "rms_db": -19.84, + "whisper_cer": 0.0, + "whisper_text": "你好欢迎", + "secs": 0.6292 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.64, + "rms_db": -18.13, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.638 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.44, + "rms_db": -21.77, + "whisper_cer": 0.3143, + "whisper_text": "阿里云 科斯科斯3号是当前开员里最先进的多语言语音合成系统之一", + "secs": 0.6852 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 7.0, + "rms_db": -21.92, + "whisper_cer": 0.5143, + "whisper_text": "阿里云·Kosivo 3號使當前開員裡最先進的多於言語音和成系統之一", + "secs": 0.6944 + } + ] + }, + { + "round": "round6_hift_trt", + "n_samples": 4, + "avg_cer": 1.0, + "avg_secs": -0.1377, + "avg_rms_db": 0.0, + "avg_dur_s": 4.53, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.8, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1333 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.44, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1331 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 7.56, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1422 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 7.32, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1422 + } + ] + }, + { + "round": "round7_fixed", + "n_samples": 4, + "avg_cer": 0.2339, + "avg_secs": 0.6151, + "avg_rms_db": -20.29, + "avg_dur_s": 4.11, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.12, + "rms_db": -20.98, + "whisper_cer": 0.25, + "whisper_text": "你好,歡迎", + "secs": 0.47 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.96, + "rms_db": -17.03, + "whisper_cer": 0.0, + "whisper_text": "你好 欢迎", + "secs": 0.6337 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.8, + "rms_db": -22.07, + "whisper_cer": 0.3143, + "whisper_text": "阿里云Cosey Voice 3號是當前開員裡最先進的多餘言語音合成系統之一", + "secs": 0.7081 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 6.56, + "rms_db": -21.08, + "whisper_cer": 0.3714, + "whisper_text": "阿里云CoseyBoss3号是当前开原理最先进的多余言云和成细统之一", + "secs": 0.6485 + } + ] + }, + { + "round": "round7_flow_concurrent", + "n_samples": 4, + "avg_cer": 1.0, + "avg_secs": -0.1376, + "avg_rms_db": 0.0, + "avg_dur_s": 4.27, + "samples": [ + { + "file": "你好欢迎_s0.wav", + "duration_s": 1.4, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1331 + }, + { + "file": "你好欢迎_s1.wav", + "duration_s": 1.68, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1332 + }, + { + "file": "阿里云Cos_s0.wav", + "duration_s": 6.76, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1421 + }, + { + "file": "阿里云Cos_s1.wav", + "duration_s": 7.24, + "rms_db": 0.0, + "whisper_cer": 1.0, + "whisper_text": "", + "secs": -0.1422 + } + ] + } +] \ No newline at end of file diff --git a/eval/quality_report.md b/eval/quality_report.md new file mode 100644 index 000000000..5ff0fd5b1 --- /dev/null +++ b/eval/quality_report.md @@ -0,0 +1,15 @@ +# Audio quality across optimization rounds + +| round | n | avg CER | avg SECS | avg RMS dB | avg dur s | flag | +|---|---:|---:|---:|---:|---:|---| +| round0_baseline | 4 | 0.2536 | 0.6065 | -21.645 | 4.33 | - | +| round10_hift_fp32 | 4 | 0.2339 | 0.6153 | -20.29 | 4.11 | - | +| round11_hift_fp16_snake32 | 4 | 0.2339 | 0.6139 | -20.23 | 4.11 | - | +| round12_hift_fp16_precise | 4 | 0.2339 | 0.6148 | -20.23 | 4.11 | - | +| 
round13_hift_fp16_clamped | 4 | 0.2339 | 0.6148 | -20.23 | 4.11 | - | +| round1_fp16 | 5 | 0.1835 | 0.6719 | -20.026 | 7.744 | - | +| round2_vllm | 4 | 0.2143 | 0.6764 | -21.1125 | 4.12 | - | +| round3_lockfree | 4 | 0.2697 | 0.6617 | -20.415 | 4.19 | - | +| round6_hift_trt | 4 | 1.0 | -0.1377 | 0.0 | 4.53 | INTELLIGIBILITY VOICE | +| round7_fixed | 4 | 0.2339 | 0.6151 | -20.29 | 4.11 | - | +| round7_flow_concurrent | 4 | 1.0 | -0.1376 | 0.0 | 4.27 | INTELLIGIBILITY VOICE | diff --git a/fe_cache.py b/fe_cache.py new file mode 100644 index 000000000..e2edc7a42 --- /dev/null +++ b/fe_cache.py @@ -0,0 +1,60 @@ +"""Frontend prompt-cache patch for CosyVoice3. + +Reuses (speech_token, speech_feat, embedding, prompt_text_token) when the +same (prompt_text, prompt_wav) combination is requested again. + +Usage: + from fe_cache import enable_fe_cache + enable_fe_cache(model) +""" +import threading + +_cache_lock = threading.Lock() +_cache = {} + + +def _key(prompt_text, prompt_wav): + return (prompt_text, prompt_wav) + + +def enable_fe_cache(model): + fe = model.frontend + orig = fe.frontend_zero_shot + + def cached_frontend_zero_shot(tts_text, prompt_text, prompt_wav, resample_rate, zero_shot_spk_id): + # When using a registered speaker id, original code already takes a fast path. + if zero_shot_spk_id != '': + return orig(tts_text, prompt_text, prompt_wav, resample_rate, zero_shot_spk_id) + + k = _key(prompt_text, prompt_wav) + with _cache_lock: + cached = _cache.get(k) + + if cached is None: + # Cold path: do full work, then cache the prompt-side outputs. + model_input = orig(tts_text, prompt_text, prompt_wav, resample_rate, zero_shot_spk_id) + cached = { + 'prompt_text': model_input['prompt_text'], + 'prompt_text_len': model_input['prompt_text_len'], + 'llm_prompt_speech_token': model_input['llm_prompt_speech_token'], + 'llm_prompt_speech_token_len': model_input['llm_prompt_speech_token_len'], + 'flow_prompt_speech_token': model_input['flow_prompt_speech_token'], + 'flow_prompt_speech_token_len': model_input['flow_prompt_speech_token_len'], + 'prompt_speech_feat': model_input['prompt_speech_feat'], + 'prompt_speech_feat_len': model_input['prompt_speech_feat_len'], + 'llm_embedding': model_input['llm_embedding'], + 'flow_embedding': model_input['flow_embedding'], + } + with _cache_lock: + _cache[k] = cached + return model_input + + # Warm path: tokenize tts_text only, splice with cached prompt-side. 
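+ # dict(cached) below is a shallow copy: the prompt-side tensors are shared
+ # across requests rather than re-extracted from the prompt wav each time.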
+ tts_text_token, tts_text_token_len = fe._extract_text_token(tts_text) + model_input = dict(cached) + model_input['text'] = tts_text_token + model_input['text_len'] = tts_text_token_len + return model_input + + fe.frontend_zero_shot = cached_frontend_zero_shot + return _cache # expose for inspection diff --git a/load_test.py b/load_test.py new file mode 100644 index 000000000..2c20c89fc --- /dev/null +++ b/load_test.py @@ -0,0 +1,69 @@ +"""Concurrent load test against the FastAPI TTS server.""" +import time, argparse, statistics, json +from concurrent.futures import ThreadPoolExecutor, as_completed +import urllib.request + +TEXTS = [ + '你好,欢迎测试这个 TTS 接口的并发处理能力。', + '阿里云 CosyVoice 三号模型是当前最先进的开源语音合成系统之一。', + '今天我们要测试一下这个接口在高并发场景下能够处理多少请求。', + '语音合成技术已经发展到了非常成熟的阶段,听起来自然流畅。', + '人工智能正在改变我们的生活,从语音到图像,应用无处不在。', +] + + +def one_request(url, idx): + text = TEXTS[idx % len(TEXTS)] + body = json.dumps({'text': text, 'seed': idx}).encode('utf-8') + req = urllib.request.Request(url, data=body, + headers={'Content-Type': 'application/json'}, + method='POST') + t0 = time.time() + with urllib.request.urlopen(req, timeout=120) as resp: + audio = resp.read() + h = dict(resp.headers) + return { + 'idx': idx, + 'wall': time.time() - t0, + 'audio_seconds': float(h.get('x-audio-seconds', 0)), + 'server_wall': float(h.get('x-wall-seconds', 0)), + 'server_rtf': float(h.get('x-rtf', 0)), + 'bytes': len(audio), + } + + +def run(url, concurrency, total): + print(f'concurrency={concurrency} total_requests={total}', flush=True) + t0 = time.time() + results = [] + with ThreadPoolExecutor(max_workers=concurrency) as ex: + futures = [ex.submit(one_request, url, i) for i in range(total)] + for fut in as_completed(futures): + try: + results.append(fut.result()) + except Exception as e: + print(f'request failed: {e}', flush=True) + elapsed = time.time() - t0 + + walls = [r['wall'] for r in results] + audios = [r['audio_seconds'] for r in results] + qps = len(results) / elapsed + audio_total = sum(audios) + rt_throughput = audio_total / elapsed + walls.sort() + p50 = walls[len(walls) // 2] + p95 = walls[int(len(walls) * 0.95)] if len(walls) > 1 else walls[0] + print(f' wall={elapsed:.2f}s | QPS={qps:.2f} | audio_throughput={rt_throughput:.2f}x realtime', flush=True) + print(f' client latency: avg={statistics.mean(walls):.2f}s p50={p50:.2f}s p95={p95:.2f}s max={max(walls):.2f}s', flush=True) + print(f' audio generated: total={audio_total:.1f}s avg_per_req={statistics.mean(audios):.2f}s', flush=True) + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument('--url', default='http://127.0.0.1:8000/tts') + p.add_argument('--concurrency', type=int, nargs='+', default=[1, 2, 4, 8]) + p.add_argument('--per-round', type=int, default=8) + args = p.parse_args() + + for c in args.concurrency: + run(args.url, concurrency=c, total=c * args.per_round) diff --git a/load_test_short.py b/load_test_short.py new file mode 100644 index 000000000..0fb60d5cd --- /dev/null +++ b/load_test_short.py @@ -0,0 +1,68 @@ +"""Same as load_test.py but only short Chinese sentences (~2-3s audio each).""" +import time, argparse, statistics, json +from concurrent.futures import ThreadPoolExecutor, as_completed +import urllib.request + +# Short prompts, all under 15 chars / 2-3s audio +SHORT_TEXTS = [ + '你好,我能帮你吗?', + '今天天气真不错。', + '欢迎使用我们的服务。', + '请问有什么需要?', + '谢谢您的反馈。', + '请稍等片刻。', +] + + +def one_request(url, idx): + text = SHORT_TEXTS[idx % len(SHORT_TEXTS)] + body = json.dumps({'text': text, 'seed': idx}).encode('utf-8') + req = 
urllib.request.Request(url, data=body, + headers={'Content-Type': 'application/json'}, + method='POST') + t0 = time.time() + with urllib.request.urlopen(req, timeout=120) as resp: + audio = resp.read() + h = dict(resp.headers) + return { + 'idx': idx, + 'wall': time.time() - t0, + 'audio_seconds': float(h.get('x-audio-seconds', 0)), + 'server_wall': float(h.get('x-wall-seconds', 0)), + 'bytes': len(audio), + } + + +def run(url, concurrency, total): + print(f'concurrency={concurrency} total_requests={total}', flush=True) + t0 = time.time() + results = [] + with ThreadPoolExecutor(max_workers=concurrency) as ex: + futures = [ex.submit(one_request, url, i) for i in range(total)] + for fut in as_completed(futures): + try: + results.append(fut.result()) + except Exception as e: + print(f'request failed: {e}', flush=True) + elapsed = time.time() - t0 + if not results: + print(' ALL FAILED'); return + walls = sorted([r['wall'] for r in results]) + audios = [r['audio_seconds'] for r in results] + qps = len(results) / elapsed + rt = sum(audios) / elapsed + p50 = walls[len(walls) // 2] + p95 = walls[int(len(walls) * 0.95)] if len(walls) > 1 else walls[0] + print(f' wall={elapsed:.2f}s | QPS={qps:.2f} | audio_throughput={rt:.2f}x realtime', flush=True) + print(f' client latency: avg={statistics.mean(walls):.2f}s p50={p50:.2f}s p95={p95:.2f}s max={max(walls):.2f}s', flush=True) + print(f' avg audio per req: {statistics.mean(audios):.2f}s', flush=True) + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument('--url', default='http://127.0.0.1:8000/tts') + p.add_argument('--concurrency', type=int, nargs='+', default=[8, 16, 32, 64]) + p.add_argument('--per-round', type=int, default=4) + args = p.parse_args() + for c in args.concurrency: + run(args.url, concurrency=c, total=c * args.per_round) diff --git a/load_test_stream.py b/load_test_stream.py new file mode 100644 index 000000000..9b97e9bee --- /dev/null +++ b/load_test_stream.py @@ -0,0 +1,155 @@ +"""Streaming TTS load test that measures TTFA (Time To First Audio chunk). + +Uses raw socket-style HTTP client (http.client) so we control when each byte +is consumed. TTFA = wall time from request send to first non-empty body chunk. 
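+
+Example (flags match the argparse defaults below):
+    python load_test_stream.py --url http://127.0.0.1:8000/tts/stream \
+        --concurrency 1 4 16 --per-round 4 --length short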
+""" +import time, argparse, statistics, json, http.client +from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import urlparse + +SHORT_TEXTS = [ + '你好,我能帮你吗?', + '今天天气真不错。', + '欢迎使用我们的服务。', + '请问有什么需要?', + '谢谢您的反馈。', +] +MEDIUM_TEXTS = [ + '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐。', + '阿里云 CosyVoice 三号是当前开源里最先进的多语言语音合成系统之一,效果非常自然流畅。', + '今天我们来测试这个服务在高并发场景下的延迟和吞吐表现,看看实际生产能力如何。', +] + + +def one_request(host, port, path, idx, texts): + text = texts[idx % len(texts)] + body = json.dumps({'text': text, 'seed': idx}).encode('utf-8') + headers = {'Content-Type': 'application/json', 'Connection': 'close'} + + t_start = time.time() + conn = http.client.HTTPConnection(host, port, timeout=180) + conn.request('POST', path, body=body, headers=headers) + resp = conn.getresponse() + if resp.status != 200: + conn.close() + return {'idx': idx, 'error': f'HTTP {resp.status}'} + + sr = int(resp.headers.get('X-Sample-Rate', 24000)) + bytes_per_sample = 2 # int16 + + # Read first chunk to capture TTFA + first_chunk = resp.read(4096) + if not first_chunk: + conn.close() + return {'idx': idx, 'error': 'empty stream'} + t_first = time.time() + + total_bytes = len(first_chunk) + while True: + chunk = resp.read(8192) + if not chunk: + break + total_bytes += len(chunk) + t_end = time.time() + conn.close() + + audio_sec = total_bytes / bytes_per_sample / sr + return { + 'idx': idx, + 'ttfa': t_first - t_start, + 'wall': t_end - t_start, + 'audio_seconds': audio_sec, + 'bytes': total_bytes, + } + + +def run(url, concurrency, total, texts, label): + parsed = urlparse(url) + host, port, path = parsed.hostname, parsed.port or 80, parsed.path + + print(f'[{label}] conc={concurrency:>3} n={total:>3}', end='', flush=True) + t0 = time.time() + results, errors = [], [] + with ThreadPoolExecutor(max_workers=concurrency) as ex: + futures = [ex.submit(one_request, host, port, path, i, texts) for i in range(total)] + for fut in as_completed(futures): + try: + r = fut.result() + if 'error' in r: + errors.append(r['error']) + else: + results.append(r) + except Exception as e: + errors.append(str(e)) + elapsed = time.time() - t0 + + if not results: + print(f' | ALL FAILED ({len(errors)} errors)') + return None + + def pct(xs, p): + xs = sorted(xs) + return xs[int(len(xs) * p)] if len(xs) > 1 else xs[0] + + walls = [r['wall'] for r in results] + ttfas = [r['ttfa'] for r in results] + audios = [r['audio_seconds'] for r in results] + qps = len(results) / elapsed + rt = sum(audios) / elapsed + + out = { + 'label': label, + 'concurrency': concurrency, + 'requests_ok': len(results), + 'errors': len(errors), + 'wall_total_s': elapsed, + 'qps': qps, + 'audio_throughput_x': rt, + 'avg_audio_per_req_s': statistics.mean(audios), + 'ttfa_p50_ms': pct(ttfas, 0.50) * 1000, + 'ttfa_p95_ms': pct(ttfas, 0.95) * 1000, + 'lat_p50_s': pct(walls, 0.50), + 'lat_p95_s': pct(walls, 0.95), + } + print(f' | QPS={out["qps"]:.2f} thru={out["audio_throughput_x"]:.1f}x' + f' | TTFA p50={out["ttfa_p50_ms"]:.0f}ms p95={out["ttfa_p95_ms"]:.0f}ms' + f' | lat p50={out["lat_p50_s"]:.2f}s p95={out["lat_p95_s"]:.2f}s' + f' | errors={out["errors"]}', flush=True) + return out + + +def sweep(url, label, texts, concurrencies, per_round=4): + rows = [] + for c in concurrencies: + out = run(url, c, c * per_round, texts, label) + if out: rows.append(out) + return rows + + +def print_table(rows, title): + print(f'\n=== {title} ===') + print(f'{"conc":>5} | {"QPS":>6} | {"thru":>6} | {"ttfa50":>7} | {"ttfa95":>7} | {"lat50":>7} | {"lat95":>7} | 
{"err":>3}') + for r in rows: + print(f'{r["concurrency"]:>5} | {r["qps"]:>6.2f} | {r["audio_throughput_x"]:>5.2f}x ' + f'| {r["ttfa_p50_ms"]:>6.0f}ms | {r["ttfa_p95_ms"]:>6.0f}ms ' + f'| {r["lat_p50_s"]:>6.2f}s | {r["lat_p95_s"]:>6.2f}s ' + f'| {r["errors"]:>3}') + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument('--url', default='http://127.0.0.1:8000/tts/stream') + p.add_argument('--concurrency', type=int, nargs='+', default=[1, 2, 4, 8, 16, 32]) + p.add_argument('--per-round', type=int, default=4) + p.add_argument('--length', choices=['short', 'medium', 'both'], default='both') + args = p.parse_args() + + short_rows = [] + medium_rows = [] + if args.length in ('short', 'both'): + short_rows = sweep(args.url, 'short', SHORT_TEXTS, args.concurrency, args.per_round) + if args.length in ('medium', 'both'): + medium_rows = sweep(args.url, 'medium', MEDIUM_TEXTS, args.concurrency, args.per_round) + + if short_rows: print_table(short_rows, f'SHORT ({len(SHORT_TEXTS[0])}-{max(len(t) for t in SHORT_TEXTS)} chars)') + if medium_rows: print_table(medium_rows, f'MEDIUM ({min(len(t) for t in MEDIUM_TEXTS)}-{max(len(t) for t in MEDIUM_TEXTS)} chars)') diff --git a/probe_hift_shapes.py b/probe_hift_shapes.py new file mode 100644 index 000000000..592db643f --- /dev/null +++ b/probe_hift_shapes.py @@ -0,0 +1,41 @@ +"""Find the exact (T_x_post_conv_pre, T_stft) relationship for several T_mel +inputs, so we can build a TRT optimization profile that satisfies the +internal-Add shape constraints.""" +import sys, torch +sys.path.append('third_party/Matcha-TTS') +from cosyvoice.cli.cosyvoice import AutoModel + + +def main(): + auto = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', + load_trt=False, load_vllm=False, fp16=False) + hift = auto.model.hift + device = next(hift.parameters()).device + hift.f0_predictor.to(torch.float64) + + print('T_mel | T_x_post_conv_pre | T_stft') + pairs = [] + for T_mel in [25, 50, 80, 100, 150, 200, 300, 500]: + mel = torch.randn(1, 80, T_mel, device=device) + f0 = hift.f0_predictor(mel.to(torch.float64), finalize=True).to(mel) + s = hift.f0_upsamp(f0[:, None]).transpose(1, 2) + s, _, _ = hift.m_source(s) + s = s.transpose(1, 2) + x = hift.conv_pre(mel) + s_real, s_imag = hift._stft(s.squeeze(1)) + s_stft = torch.cat([s_real, s_imag], dim=1) + print(f'{T_mel:>5} | {x.shape[2]:>17} | {s_stft.shape[2]:>6}') + pairs.append((T_mel, x.shape[2], s_stft.shape[2])) + + # Fit linear: T_stft = a * T_x + b + import statistics + if len(pairs) >= 2: + xs = [p[1] for p in pairs] + ys = [p[2] for p in pairs] + slope = (ys[-1] - ys[0]) / (xs[-1] - xs[0]) + intercept = ys[0] - slope * xs[0] + print(f'\nFit: T_stft = {slope:.4f} * T_x + {intercept:.4f}') + + +if __name__ == '__main__': + main() diff --git a/profile_deep.py b/profile_deep.py new file mode 100644 index 000000000..9248fb22b --- /dev/null +++ b/profile_deep.py @@ -0,0 +1,283 @@ +"""Deep stage breakdown of CosyVoice3 — every internal method instrumented. 
+ +Records, per request: + Frontend stages: + FE.tokenize_tts - encode tts text + FE.tokenize_prompt - encode prompt text + FE.speech_feat - mel for flow (load wav 24k + mel) + FE.speech_token - speech_tokenizer_v3.onnx (load wav 16k + log_mel + ONNX) + FE.spk_embedding - campplus.onnx (load wav 16k + kaldi.fbank + ONNX) + LLM stages: + LLM.first_token_ms - time from llm_job start to first token yielded + LLM.per_token_ms - mean time between subsequent tokens + LLM.total_ms - whole llm_job duration + LLM.tokens_emitted - count + T2W stages (per chunk, summed if multi): + T2W.flow_ms - flow matching (TRT) + T2W.hift_ms - HiFi-GAN vocoder + T2W.cuda_sync_ms - explicit synchronize after +""" +import sys, time, statistics, threading +sys.path.append('third_party/Matcha-TTS') + +import torch +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM +ModelRegistry.register_model('CosyVoice2ForCausalLM', CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.utils.common import set_all_random_seed + +PROMPT_TEXT = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' +PROMPT_WAV = './asset/zero_shot_prompt.wav' + +TEXTS = { + 'short': '你好,今天天气真不错。', + 'medium': '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', +} + +# request-id (uuid) keyed records, since llm runs in its own thread but we have uuid +_recs = {} # uuid -> dict +_recs_lock = threading.Lock() +_current_req = threading.local() + + +def _ensure(uuid_): + with _recs_lock: + return _recs.setdefault(uuid_, {}) + + +def _rec(uuid_, key, val): + d = _ensure(uuid_) + d[key] = val + + +def _rec_add(uuid_, key, val): + d = _ensure(uuid_) + d[key] = d.get(key, 0) + val + + +def patch(model): + fe = model.frontend + m = model.model + + # ---- Frontend substages: time the *first* call per request via thread-local ---- + orig_etx = fe._extract_text_token + orig_esp_feat = fe._extract_speech_feat + orig_esp_tok = fe._extract_speech_token + orig_spk_emb = fe._extract_spk_embedding + + def w_etx(text): + t0 = time.perf_counter(); r = orig_etx(text) + # first call = tts text, second = prompt text + u = getattr(_current_req, 'uuid', None) + if u: + d = _ensure(u) + key = 'FE.tokenize_tts' if 'FE.tokenize_tts' not in d else 'FE.tokenize_prompt' + d[key] = (time.perf_counter() - t0) * 1000 + return r + + def w_esp_feat(wav): + t0 = time.perf_counter(); r = orig_esp_feat(wav) + u = getattr(_current_req, 'uuid', None) + if u: _rec(u, 'FE.speech_feat', (time.perf_counter() - t0) * 1000) + return r + + def w_esp_tok(wav): + t0 = time.perf_counter(); r = orig_esp_tok(wav) + u = getattr(_current_req, 'uuid', None) + if u: _rec(u, 'FE.speech_token', (time.perf_counter() - t0) * 1000) + return r + + def w_spk_emb(wav): + t0 = time.perf_counter(); r = orig_spk_emb(wav) + u = getattr(_current_req, 'uuid', None) + if u: _rec(u, 'FE.spk_embedding', (time.perf_counter() - t0) * 1000) + return r + + fe._extract_text_token = w_etx + fe._extract_speech_feat = w_esp_feat + fe._extract_speech_token = w_esp_tok + fe._extract_spk_embedding = w_spk_emb + + # ---- LLM: wrap llm_job to time first-token vs total ---- + orig_llm_job = m.llm_job + + def w_llm_job(text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): + # Count what gets appended to tts_speech_token_dict[uuid] over time + d = _ensure(uuid) + before_len = 0 + first_token_at = None + t0 = time.perf_counter() + # Run original; we observe the dict as it grows via sampling thread + stop_event = threading.Event() + last_count = 
[0] + first_t = [None] + + def watcher(): + while not stop_event.is_set(): + cur = len(m.tts_speech_token_dict.get(uuid, [])) + if first_t[0] is None and cur > 0: + first_t[0] = time.perf_counter() + last_count[0] = cur + time.sleep(0.005) # 5ms sampling + + watch_thread = threading.Thread(target=watcher, daemon=True) + watch_thread.start() + try: + r = orig_llm_job(text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid) + finally: + stop_event.set() + watch_thread.join(timeout=0.1) + t_end = time.perf_counter() + total_ms = (t_end - t0) * 1000 + n_tokens = last_count[0] + first_ms = ((first_t[0] - t0) * 1000) if first_t[0] else None + d['LLM.total_ms'] = total_ms + d['LLM.tokens'] = n_tokens + d['LLM.first_token_ms'] = first_ms + if n_tokens > 1 and first_t[0] is not None: + d['LLM.per_token_ms'] = ((t_end - first_t[0]) * 1000) / max(n_tokens - 1, 1) + return r + + m.llm_job = w_llm_job + + # ---- T2W: split flow vs hift ---- + orig_flow_inf = m.flow.inference + orig_hift_inf = m.hift.inference + + def w_flow(*a, **kw): + t0 = time.perf_counter(); r = orig_flow_inf(*a, **kw) + if torch.cuda.is_available(): torch.cuda.current_stream().synchronize() + u = getattr(_current_req, 'uuid', None) + if u: _rec_add(u, 'T2W.flow_ms', (time.perf_counter() - t0) * 1000) + return r + + def w_hift(*a, **kw): + t0 = time.perf_counter(); r = orig_hift_inf(*a, **kw) + if torch.cuda.is_available(): torch.cuda.current_stream().synchronize() + u = getattr(_current_req, 'uuid', None) + if u: _rec_add(u, 'T2W.hift_ms', (time.perf_counter() - t0) * 1000) + return r + + m.flow.inference = w_flow + m.hift.inference = w_hift + + # Wrap model.tts to set the current uuid for the request thread + orig_tts = m.tts + + def w_tts(*a, **kw): + # Generate uuid here (matches what tts() does internally); then the inner + # tts() will create its own. We can't easily inject. Instead, set a + # thread-local that the patched submethods use. + # Better: extract uuid by intercepting the call. + import uuid as uuid_mod + # We can't easily pre-create uuid since tts() generates its own. + # Workaround: clear thread-local uuid, then sniff via hift_cache_dict creation. 
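+        # NOTE: taking the newest key from tts_speech_token_dict attributes timings to the
+        # most recently started request; that is only reliable when this profiler is driven
+        # sequentially, so concurrent runs may mix up per-request stage numbers.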
+ _current_req.uuid = None + gen = orig_tts(*a, **kw) + for chunk in gen: + # by now, tts() has populated some dicts with this uuid + # find the latest uuid known to model + with m.lock: + if m.tts_speech_token_dict: + # latest is fine for our use + _current_req.uuid = list(m.tts_speech_token_dict.keys())[-1] + yield chunk + + m.tts = w_tts + + +def run_one(model, text, seed, stream=False): + set_all_random_seed(seed) + # Pre-set thread-local uuid will be set inside w_tts + t0 = time.perf_counter() + chunks = 0 + audio_sec = 0.0 + t_first = None + for j in model.inference_zero_shot(text, PROMPT_TEXT, PROMPT_WAV, stream=stream): + if t_first is None: + t_first = time.perf_counter() + chunks += 1 + audio_sec += j['tts_speech'].shape[-1] / model.sample_rate + t_end = time.perf_counter() + u = getattr(_current_req, 'uuid', None) + if u: + d = _ensure(u) + d['_TTFA_ms'] = ((t_first - t0) * 1000) if t_first else None + d['_TOTAL_ms'] = (t_end - t0) * 1000 + d['_CHUNKS'] = chunks + d['_AUDIO_S'] = audio_sec + return u + + +def aggregate(uuids): + by_key = {} + for u in uuids: + d = _recs.get(u, {}) + for k, v in d.items(): + if v is None: continue + by_key.setdefault(k, []).append(v) + return by_key + + +def print_table(label, uuids): + bk = aggregate(uuids) + print(f'\n=== {label} (n={len(uuids)}) ===') + keys_order = [ + 'FE.tokenize_tts', 'FE.tokenize_prompt', + 'FE.speech_feat', 'FE.speech_token', 'FE.spk_embedding', + 'LLM.first_token_ms', 'LLM.per_token_ms', 'LLM.tokens', 'LLM.total_ms', + 'T2W.flow_ms', 'T2W.hift_ms', + '_TTFA_ms', '_TOTAL_ms', '_CHUNKS', '_AUDIO_S', + ] + for k in keys_order: + if k not in bk: continue + vs = bk[k] + if k in ('LLM.tokens', '_CHUNKS'): + print(f' {k:>22} | avg={statistics.mean(vs):8.1f} min={min(vs):.0f} max={max(vs):.0f} n={len(vs)}') + elif k == '_AUDIO_S': + print(f' {k:>22} | avg={statistics.mean(vs):8.2f}s n={len(vs)}') + else: + srt = sorted(vs) + p50 = srt[len(srt) // 2] + p95 = srt[int(len(srt) * 0.95)] if len(srt) > 1 else srt[0] + print(f' {k:>22} | avg={statistics.mean(vs):7.1f}ms p50={p50:7.1f}ms p95={p95:7.1f}ms n={len(vs)}') + + +def main(): + print('Loading CosyVoice3 (TRT + vLLM, fp32) ...', flush=True) + t0 = time.time() + model = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False) + print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + patch(model) + + # warmup + for s in (1000, 1001): + run_one(model, TEXTS['medium'], seed=s, stream=False) + _recs.clear() + + # --- Sync, sequential --- + uuids = [] + for s in range(10, 16): + u = run_one(model, TEXTS['medium'], seed=s, stream=False) + if u: uuids.append(u) + print_table('SYNC medium x6', uuids) + + # --- Stream, sequential --- + uuids = [] + for s in range(20, 26): + u = run_one(model, TEXTS['medium'], seed=s, stream=True) + if u: uuids.append(u) + print_table('STREAM medium x6', uuids) + + # --- Stream, short, sequential --- + uuids = [] + for s in range(30, 36): + u = run_one(model, TEXTS['short'], seed=s, stream=True) + if u: uuids.append(u) + print_table('STREAM short x6', uuids) + + +if __name__ == '__main__': + main() diff --git a/profile_deep_cache.py b/profile_deep_cache.py new file mode 100644 index 000000000..2d9a2f080 --- /dev/null +++ b/profile_deep_cache.py @@ -0,0 +1,53 @@ +"""Same deep profile but with FE cache enabled.""" +import os, sys, time, statistics +sys.path.append('third_party/Matcha-TTS') + +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM 
+ModelRegistry.register_model('CosyVoice2ForCausalLM', CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +from fe_cache import enable_fe_cache +import profile_deep as PD + + +def main(): + fp16 = os.environ.get('FP16', '1') == '1' + print(f'Loading CosyVoice3 (TRT + vLLM, fp16={fp16}) + FE cache ...', flush=True) + t0 = time.time() + model = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=fp16) + print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + + # IMPORTANT: enable cache BEFORE patching for profile (since cache wraps frontend_zero_shot) + enable_fe_cache(model) + PD.patch(model) + + # warmup also primes the cache + for s in (1000, 1001): + PD.run_one(model, PD.TEXTS['medium'], seed=s, stream=False) + PD._recs.clear() + + # --- Sync, sequential --- + uuids = [] + for s in range(10, 16): + u = PD.run_one(model, PD.TEXTS['medium'], seed=s, stream=False) + if u: uuids.append(u) + PD.print_table('SYNC medium x6 (cached prompt)', uuids) + + # --- Stream, sequential --- + uuids = [] + for s in range(20, 26): + u = PD.run_one(model, PD.TEXTS['medium'], seed=s, stream=True) + if u: uuids.append(u) + PD.print_table('STREAM medium x6 (cached prompt)', uuids) + + # --- Stream, short, sequential --- + uuids = [] + for s in range(30, 36): + u = PD.run_one(model, PD.TEXTS['short'], seed=s, stream=True) + if u: uuids.append(u) + PD.print_table('STREAM short x6 (cached prompt)', uuids) + + +if __name__ == '__main__': + main() diff --git a/profile_stages.py b/profile_stages.py new file mode 100644 index 000000000..3a5c74f08 --- /dev/null +++ b/profile_stages.py @@ -0,0 +1,201 @@ +"""Stage-by-stage profiling of CosyVoice3 inference. + +Patches key methods to record per-call timings, runs N inferences, prints a +breakdown table. + +Stages measured: + TN - text_normalize (frontend, CPU + small ONNX) + FE - frontend_zero_shot (audio prompt → mel + token, GPU/ONNX, runs once per req) + LLM - llm_job (vLLM generation thread, background) + T2W* - token2wav per chunk (flow matching + hift vocoder, on each yield) + TTFA - wall-clock from inference start to first yield + TOTAL - wall-clock from inference start to last yield +""" +import sys, time, statistics, threading +sys.path.append('third_party/Matcha-TTS') + +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM +ModelRegistry.register_model('CosyVoice2ForCausalLM', CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.utils.common import set_all_random_seed + +PROMPT_TEXT = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' +PROMPT_WAV = './asset/zero_shot_prompt.wav' + +TEXTS = { + 'short': '你好,今天天气真不错。', + 'medium': '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', +} + +# global timing stash, threadlocal-ish via thread name +_per_request = {} # thread_name -> dict of stage -> [durations_ms] +_per_request_lock = threading.Lock() + + +def _record(stage, dur_ms): + name = threading.current_thread().name + with _per_request_lock: + d = _per_request.setdefault(name, {}) + d.setdefault(stage, []).append(dur_ms) + + +def patch(model): + fe = model.frontend + m = model.model + + orig_tn = fe.text_normalize + orig_fzs = fe.frontend_zero_shot + orig_llm = m.llm_job + orig_t2w = m.token2wav + + def w_tn(text, *a, **kw): + t0 = time.perf_counter(); r = orig_tn(text, *a, **kw) + # text_normalize returns generator if split=True. Only time list materialization. 
+ if hasattr(r, '__iter__') and not isinstance(r, (str, list)): + r = list(r) + _record('TN', (time.perf_counter() - t0) * 1000) + return r + + def w_fzs(*a, **kw): + t0 = time.perf_counter(); r = orig_fzs(*a, **kw) + _record('FE', (time.perf_counter() - t0) * 1000) + return r + + def w_llm(*a, **kw): + t0 = time.perf_counter(); r = orig_llm(*a, **kw) + _record('LLM', (time.perf_counter() - t0) * 1000) + return r + + def w_t2w(*a, **kw): + t0 = time.perf_counter(); r = orig_t2w(*a, **kw) + _record('T2W', (time.perf_counter() - t0) * 1000) + return r + + fe.text_normalize = w_tn + fe.frontend_zero_shot = w_fzs + m.llm_job = w_llm + m.token2wav = w_t2w + + +def run_one(model, text, seed, stream=False): + threading.current_thread().name = f'req-{seed}-{int(time.time()*1000)%10000}' + set_all_random_seed(seed) + t_start = time.perf_counter() + t_first = None + audio_sec = 0.0 + chunks = 0 + for j in model.inference_zero_shot(text, PROMPT_TEXT, PROMPT_WAV, stream=stream): + if t_first is None: + t_first = time.perf_counter() + chunks += 1 + audio_sec += j['tts_speech'].shape[-1] / model.sample_rate + t_end = time.perf_counter() + name = threading.current_thread().name + with _per_request_lock: + d = _per_request.setdefault(name, {}) + d['TTFA'] = [(t_first - t_start) * 1000] if t_first else [0] + d['TOTAL'] = [(t_end - t_start) * 1000] + d['CHUNKS'] = chunks + d['AUDIO_S'] = audio_sec + return name + + +def summarize(req_names): + """Aggregate per-stage stats across the given requests.""" + stage_totals = {} # stage -> list of total_ms_per_request + chunks_list = [] + audio_list = [] + for n in req_names: + d = _per_request.get(n, {}) + chunks_list.append(d.get('CHUNKS', 0)) + audio_list.append(d.get('AUDIO_S', 0.0)) + for stage, durs in d.items(): + if stage in ('CHUNKS', 'AUDIO_S'): continue + tot = sum(durs) if isinstance(durs, list) else durs + stage_totals.setdefault(stage, []).append(tot) + return stage_totals, chunks_list, audio_list + + +def fmt_row(name, vals): + if not vals: + return f'{name:>6} | n=0' + avg = statistics.mean(vals) + p50 = sorted(vals)[len(vals) // 2] + p95 = sorted(vals)[int(len(vals) * 0.95)] if len(vals) > 1 else vals[0] + return f'{name:>6} | avg={avg:7.1f}ms p50={p50:7.1f}ms p95={p95:7.1f}ms n={len(vals)}' + + +def print_breakdown(label, req_names, expected_audio_per_req=None): + stage_totals, chunks, audios = summarize(req_names) + print(f'\n=== {label} ({len(req_names)} reqs) ===') + if audios: + print(f' avg_audio_per_req={statistics.mean(audios):.2f}s avg_chunks={statistics.mean(chunks):.1f}') + # known stages in order + for s in ['TN', 'FE', 'LLM', 'T2W', 'TTFA', 'TOTAL']: + if s in stage_totals: + print(' ' + fmt_row(s, stage_totals[s])) + + +def main(): + print('Loading CosyVoice3 (TRT + vLLM, fp32) ...', flush=True) + t0 = time.time() + model = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False) + print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + patch(model) + + # warmup + print('Warming up...', flush=True) + for s in (1000, 1001): + run_one(model, TEXTS['medium'], seed=s, stream=False) + _per_request.clear() + + # 1) Cold first request (sync, medium) + n = run_one(model, TEXTS['medium'], seed=1, stream=False) + print_breakdown('SYNC, medium, single request (post-warmup)', [n]) + + # 2) Sequential runs (sync) for steady state + seqs = [] + for s in range(10, 16): + n = run_one(model, TEXTS['medium'], seed=s, stream=False) + seqs.append(n) + print_breakdown('SYNC, medium, sequential x6', seqs) + + 
# 3) Streaming single request + n = run_one(model, TEXTS['medium'], seed=20, stream=True) + print_breakdown('STREAM, medium, single request', [n]) + + # 4) Streaming x6 sequential to see TTFA stability + seqs = [] + for s in range(30, 36): + n = run_one(model, TEXTS['medium'], seed=s, stream=True) + seqs.append(n) + print_breakdown('STREAM, medium, sequential x6', seqs) + + # 5) Concurrent stream conc=4 to see how stages overlap + print('\n=== CONCURRENT stream, conc=4, n=8 ===') + _per_request.clear() + import queue, threading as th + q = queue.Queue() + for i in range(40, 48): q.put(i) + names = [] + names_lock = th.Lock() + + def worker(): + while True: + try: s = q.get_nowait() + except queue.Empty: return + n = run_one(model, TEXTS['medium'], seed=s, stream=True) + with names_lock: names.append(n) + + t0 = time.time() + threads = [th.Thread(target=worker) for _ in range(4)] + for t in threads: t.start() + for t in threads: t.join() + wall = time.time() - t0 + print_breakdown(f'STREAM, conc=4, total wall={wall:.2f}s', names) + + +if __name__ == '__main__': + main() diff --git a/restart_server.sh b/restart_server.sh new file mode 100644 index 000000000..f59bca5ce --- /dev/null +++ b/restart_server.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Robust server restart - uses setsid to fully detach from parent session +pkill -9 -f server_cosyvoice3 2>/dev/null || true +pkill -9 -f run_server 2>/dev/null || true +sleep 3 +> /home/zhiqiang/server-opt.log +setsid bash -c '/home/zhiqiang/run_server.sh > /home/zhiqiang/server-opt.log 2>&1' < /dev/null > /dev/null 2>&1 & +echo "launched pid=$!" diff --git a/restart_server_simple.sh b/restart_server_simple.sh new file mode 100644 index 000000000..5de44105c --- /dev/null +++ b/restart_server_simple.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Stable restart - NO LD_LIBRARY_PATH cudnn override (which destabilizes WSL) +pkill -9 -f server_cosyvoice3 2>/dev/null || true +sleep 3 +> /home/zhiqiang/server-opt.log +cd /home/zhiqiang/repos/CosyVoice +setsid bash -c '/home/zhiqiang/.venvs/cosyvoice/bin/python -u server_cosyvoice3.py > /home/zhiqiang/server-opt.log 2>&1' < /dev/null > /dev/null 2>&1 & +echo "launched pid=$!" diff --git a/run_server.sh b/run_server.sh new file mode 100644 index 000000000..b4efd9612 --- /dev/null +++ b/run_server.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Launch CosyVoice TTS server with optimizations: +# - LD_LIBRARY_PATH set so onnxruntime-gpu finds cuDNN 8 + cuBLAS / cudart +# - FE prompt-cache enabled (in server_cosyvoice3.py via enable_fe_cache) +# - TRT engine + vLLM continuous batching +# - No model lock (vLLM thread-safe, CosyVoice tolerated) +# +# Usage: bash run_server.sh + +set -euo pipefail + +VENV=/home/zhiqiang/.venvs/cosyvoice +NV=$VENV/lib/python3.10/site-packages/nvidia +REPO=/home/zhiqiang/repos/CosyVoice +LOG=/home/zhiqiang/server.log + +paths=() +for sub in cudnn cublas cuda_runtime curand cufft cusolver cusparse nccl nvjitlink cuda_nvrtc cuda_cupti; do + d="$NV/$sub/lib" + [ -d "$d" ] && paths+=("$d") +done +joined=$(IFS=:; echo "${paths[*]}") +export LD_LIBRARY_PATH="${joined}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" + +export LOAD_TRT=${LOAD_TRT:-1} +export MODEL_DIR=${MODEL_DIR:-pretrained_models/Fun-CosyVoice3-0.5B} + +cd "$REPO" +echo "[launcher] LD_LIBRARY_PATH set, starting server ..." 
+exec "$VENV/bin/python" -u server_cosyvoice3.py diff --git a/server_cosyvoice3.py b/server_cosyvoice3.py new file mode 100644 index 000000000..90496b104 --- /dev/null +++ b/server_cosyvoice3.py @@ -0,0 +1,210 @@ +"""FastAPI wrapper for CosyVoice3 zero-shot TTS, no model lock. + +Drops the global model lock so vLLM's continuous batching can fuse concurrent +requests at the LLM step. Flow matching + hift may not be strictly thread-safe, +but the in-process concurrent bench ran without crashes — exposing the same +behavior here lets us measure HTTP-side QPS without lock serialization. + +Endpoints: +- GET /health → {"ok": true, "model_loaded": bool} +- POST /tts → {"text": "...", "seed": 0} → wav bytes +- GET /metrics → cumulative request count + audio seconds generated + plus in_flight gauge +""" +import io, os, sys, time, threading +sys.path.append('third_party/Matcha-TTS') + +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM +ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.utils.common import set_all_random_seed +from fe_cache import enable_fe_cache +import torchaudio +import torch +from fastapi import FastAPI, HTTPException +from fastapi.responses import Response, StreamingResponse, FileResponse +from pydantic import BaseModel + +PROMPT_TEXT = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' +PROMPT_WAV = './asset/zero_shot_prompt.wav' +MODEL_DIR = os.environ.get('MODEL_DIR', 'pretrained_models/Fun-CosyVoice3-0.5B') +LOAD_TRT = os.environ.get('LOAD_TRT', '1') == '1' +FP16 = os.environ.get('FP16', '1') == '1' + +app = FastAPI(title='CosyVoice3 TTS (lockfree)') +_model = None +_metrics = {'requests': 0, 'audio_seconds': 0.0, 'total_wall_seconds': 0.0, + 'in_flight': 0, 'errors': 0} +_metrics_lock = threading.Lock() + + +class TTSRequest(BaseModel): + text: str + seed: int = 0 + + +@app.on_event('startup') +def load_model(): + global _model + print(f'[startup] loading {MODEL_DIR}, trt={LOAD_TRT}, fp16={FP16} ...', flush=True) + t0 = time.time() + _model = AutoModel(model_dir=MODEL_DIR, load_trt=LOAD_TRT, load_vllm=True, fp16=FP16) + enable_fe_cache(_model) + print(f'[startup] loaded in {time.time()-t0:.2f}s; FE prompt cache enabled', flush=True) + + +@app.get('/') +def index(): + here = os.path.dirname(os.path.abspath(__file__)) + html_path = os.path.join(here, 'web', 'index.html') + if os.path.exists(html_path): + return FileResponse(html_path, media_type='text/html') + raise HTTPException(404, 'web/index.html not found') + + +@app.get('/health') +def health(): + return {'ok': _model is not None, 'model_loaded': _model is not None} + + +@app.get('/metrics') +def metrics(): + with _metrics_lock: + m = {k: v for k, v in _metrics.items() if k != 'ttfa_samples'} + ttfa_samples = list(_metrics.get('ttfa_samples', [])) + m['realtime_factor'] = (m['audio_seconds'] / m['total_wall_seconds']) if m['total_wall_seconds'] > 0 else None + if ttfa_samples: + ttfa_samples.sort() + n = len(ttfa_samples) + m['ttfa_p50_ms'] = round(ttfa_samples[n // 2] * 1000, 1) + m['ttfa_p95_ms'] = round(ttfa_samples[int(n * 0.95)] * 1000, 1) + m['ttfa_p99_ms'] = round(ttfa_samples[int(n * 0.99)] * 1000, 1) if n >= 100 else None + m['ttfa_count'] = n + return m + + +@app.post('/tts') +def tts(req: TTSRequest): + if _model is None: + raise HTTPException(503, 'model not loaded') + if not req.text.strip(): + raise HTTPException(400, 'empty text') + + with _metrics_lock: + 
_metrics['in_flight'] += 1 + t0 = time.time() + try: + # NB: no lock — relies on vllm thread-safety + tolerated CosyVoice races + set_all_random_seed(req.seed) + chunks = [] + for j in _model.inference_zero_shot(req.text, PROMPT_TEXT, PROMPT_WAV, stream=False): + chunks.append(j['tts_speech']) + except Exception as e: + with _metrics_lock: + _metrics['errors'] += 1 + _metrics['in_flight'] -= 1 + raise HTTPException(500, f'inference failed: {type(e).__name__}: {e}') + wall = time.time() - t0 + + if not chunks: + with _metrics_lock: + _metrics['errors'] += 1 + _metrics['in_flight'] -= 1 + raise HTTPException(500, 'no audio generated') + + audio = torch.cat(chunks, dim=-1) + audio_sec = audio.shape[-1] / _model.sample_rate + + buf = io.BytesIO() + torchaudio.save(buf, audio, _model.sample_rate, format='wav') + buf.seek(0) + + with _metrics_lock: + _metrics['requests'] += 1 + _metrics['audio_seconds'] += audio_sec + _metrics['total_wall_seconds'] += wall + _metrics['in_flight'] -= 1 + + return Response( + content=buf.read(), + media_type='audio/wav', + headers={ + 'X-Audio-Seconds': f'{audio_sec:.3f}', + 'X-Wall-Seconds': f'{wall:.3f}', + 'X-RTF': f'{wall/audio_sec:.3f}', + }, + ) + + +@app.post('/tts/stream') +def tts_stream(req: TTSRequest): + """Streaming TTS: returns chunked raw PCM int16 mono. + + Client should read chunks as they arrive — the time between request send + and first byte received is TTFA (Time To First Audio). + + Sample rate is in `X-Sample-Rate` header. + """ + if _model is None: + raise HTTPException(503, 'model not loaded') + if not req.text.strip(): + raise HTTPException(400, 'empty text') + + sr = _model.sample_rate + text = req.text + seed = req.seed + + with _metrics_lock: + _metrics['in_flight'] += 1 + started = time.time() + state = {'first_chunk_at': None, 'audio_sec': 0.0, 'errored': False} + + def gen(): + try: + set_all_random_seed(seed) + for j in _model.inference_zero_shot(text, PROMPT_TEXT, PROMPT_WAV, stream=True): + tensor = j['tts_speech'].squeeze().contiguous() + # tts_speech is float in [-1, 1]; encode to int16 PCM + int16 = (tensor.clamp(-1, 1) * 32767).to(torch.int16) + pcm_bytes = int16.cpu().numpy().tobytes() + if state['first_chunk_at'] is None: + state['first_chunk_at'] = time.time() + state['audio_sec'] += tensor.shape[-1] / sr + yield pcm_bytes + except Exception as e: + state['errored'] = True + print(f'[stream] inference error: {type(e).__name__}: {e}', flush=True) + # Can't raise HTTPException after streaming started; just log. 
+ finally: + wall = time.time() - started + ttfa = (state['first_chunk_at'] - started) if state['first_chunk_at'] else None + with _metrics_lock: + _metrics['in_flight'] -= 1 + if state['errored']: + _metrics['errors'] += 1 + else: + _metrics['requests'] += 1 + _metrics['audio_seconds'] += state['audio_sec'] + _metrics['total_wall_seconds'] += wall + if ttfa is not None: + _metrics.setdefault('ttfa_samples', []).append(ttfa) + # cap memory: keep last 1000 + if len(_metrics['ttfa_samples']) > 1000: + _metrics['ttfa_samples'] = _metrics['ttfa_samples'][-1000:] + + return StreamingResponse( + gen(), + media_type='audio/L16', + headers={ + 'X-Sample-Rate': str(sr), + 'X-Channels': '1', + 'X-Format': 'int16', + }, + ) + + +if __name__ == '__main__': + import uvicorn + uvicorn.run(app, host='0.0.0.0', port=8000) diff --git a/setup_ld_path.sh b/setup_ld_path.sh new file mode 100644 index 000000000..b0346e88d --- /dev/null +++ b/setup_ld_path.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Expose all NVIDIA CUDA shared libraries from the venv to LD_LIBRARY_PATH +# so onnxruntime-gpu can find libcublasLt, libcudnn, etc. + +VENV_NV="/home/zhiqiang/.venvs/cosyvoice/lib/python3.10/site-packages/nvidia" + +paths=() +for sub in cudnn cublas cuda_runtime curand cufft cusolver cusparse nccl nvjitlink cuda_nvrtc cuda_cupti; do + d="$VENV_NV/$sub/lib" + [ -d "$d" ] && paths+=("$d") +done + +joined=$(IFS=:; echo "${paths[*]}") + +if [ -n "$LD_LIBRARY_PATH" ]; then + export LD_LIBRARY_PATH="$joined:$LD_LIBRARY_PATH" +else + export LD_LIBRARY_PATH="$joined" +fi + +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" diff --git a/slo_analysis.md b/slo_analysis.md new file mode 100644 index 000000000..2d8bef4fd --- /dev/null +++ b/slo_analysis.md @@ -0,0 +1,409 @@ +# CosyVoice3 + vLLM + TRT on RTX 3090 — SLO-based capacity + +**Test setup**: WSL2 Ubuntu 22.04, RTX 3090 24GB, CosyVoice3 via vLLM 0.11.0 + TRT engine, no model lock, FastAPI `/tts/stream` endpoint (raw int16 PCM). Test date: 2026-04-22. + +## Raw sweep data + +### SHORT text (9-10 chars, ~1.6-2.0s audio / request) + +| conc | QPS | audio×rt | TTFA p50 | TTFA p95 | total p50 | total p95 | errors | +|-----:|----:|---------:|---------:|---------:|----------:|----------:|-------:| +| 1 | 0.89 | 1.60x | 1170ms | 1376ms | 1.17s | 1.38s | 0 | +| 2 | 1.33 | 2.67x | 1376ms | 2007ms | 1.38s | 2.01s | 0 | +| **4** | **2.68** | **5.66x** | 1772ms | 2032ms | 1.78s | 2.03s | 0 | +| 8 | 2.71 | 5.27x | 2942ms | 3759ms | 2.97s | 3.76s | 0 | +| 16 | 3.14 | 5.86x | 4772ms | 6928ms | 4.78s | 6.95s | 0 | +| 32 | 2.93 | 5.67x | 9841ms | 15332ms | 9.93s | 15.35s | 0 | + +### MEDIUM text (38-47 chars, ~3-8s audio / request) + +| conc | QPS | audio×rt | TTFA p50 | TTFA p95 | total p50 | total p95 | errors | +|-----:|----:|---------:|---------:|---------:|----------:|----------:|-------:| +| 1 | 0.37 | 3.39x | 1678ms | 1688ms | 2.67s | 2.78s | 0 | +| **2** | **0.67** | **6.04x** | 2144ms | 2456ms | 3.08s | 4.08s | 0 | +| 4 | 0.69 | 6.09x | 3183ms | 4976ms | 5.38s | 9.17s | 0 | +| 8 | 0.80 | 7.05x | 4846ms | 5747ms | 8.90s | 13.23s | 0 | +| 16 | 0.77 | 6.76x | 9344ms | 11778ms | 18.93s | 29.06s | 0 | +| 32 | 0.81 | 7.15x | 18421ms | 21545ms | 37.78s | 56.13s | 0 | + +## Knee-point analysis (where latency starts dominating) + +Per Little's Law, in a stable system: `concurrency = latency × throughput`. The knee is where pushing more concurrency only grows latency without adding throughput. + +- **SHORT**: knee at conc=4 (QPS 2.68). Going to 8 = +1% QPS, +67% TTFA p50. 
Going to 16 = +17% QPS but +160% TTFA p50. Efficiency dead. +- **MEDIUM**: knee at conc=2 (QPS 0.67). QPS barely climbs past that; latency grows linearly. + +## SLO-bound maximum achievable QPS + +Real production services pick a TTFA target + total-latency target. Here's what's achievable on one RTX 3090 with CosyVoice3+vLLM+TRT: + +| SLO (TTFA p95 / total p95) | Short text QPS | Medium text QPS | Use case | +|---|---:|---:|---| +| TTFA ≤ 300ms, total ≤ 1s | **0** | **0** | Real-time voice agent ❌ not feasible | +| TTFA ≤ 1.5s, total ≤ 3s | ~0.9 (conc=1) | 0 | Voice assistant batching ⚠️ | +| TTFA ≤ 2.5s, total ≤ 4s | ~2.7 (conc=4) | ~0.7 (conc=2) | Near-realtime narration ✅ | +| TTFA ≤ 5s, total ≤ 10s | ~2.7 (conc=8) | ~0.8 (conc=8) | Batch/podcast gen ✅ | +| No SLO, max throughput | ~3.1 (conc=16) | ~0.8 (conc=32) | Offline batch ⚠️ very long tail | + +## What the data says about 200 QPS + +Absolute maximum observed on single 3090: **3.14 QPS** (short text, conc=16, TTFA p95=6.9s). + +To reach 200 QPS with comparable SLO, need: +- **~64× short text throughput** → 64 GPUs, or a 64x faster model +- **~74× medium text throughput** → even more + +Unchanged conclusion: **single RTX 3090 cannot reach 200 QPS with CosyVoice3** under any SLO that allows < 10s latency. + +## Cost lens (¥ per audio hour) + +Single 3090 peak audio throughput (streaming, short text, conc=16): 5.86x realtime. +- 1 GPU·hour produces ~5.86 audio·hours of short-text content +- At ¥1.5/GPU·hour: **~¥0.26 / audio·hour** + +Compare: +- Aliyun Qwen3-TTS API: ~¥1-2/万字符 ≈ ¥2-5/audio·hour (depends on text density) +- Self-hosted CosyVoice3 breaks even at ~3-5 audio·hours/day; beyond that self-host is cheaper + +## Optimization rounds (2026-04-23) + +Apples-to-apples short-text (~9-10 chars, ~1.6s audio/req) on the same WSL+3090 ++ FE-cache + lock-free server. `n=4` per concurrency, so conc=1 p95 includes a +cold-start outlier — focus on p50. + +| Round | Change | conc=1 TTFA p50 | conc=4 TTFA p50 | conc=4 TTFA p95 | conc=4 QPS | conc=4 lat p95 | +|---|---|---:|---:|---:|---:|---:| +| 0 (baseline) | TRT fp32, FE-cache, lock-free | 588 ms | 1141 ms | 2067 ms | 3.39 | 2.09 s | +| **1** | **+ Flow TRT fp16** | **559 ms** (−5%) | **997 ms** (−13%) | **1210 ms** (−41%) | **3.58** (+6%) | **1.21 s** (−42%) | +| **2** | **+ vLLM `gpu_mem=0.6` + chunked-prefill + `max_num_seqs=64`** | **525 ms** (−11%) | 1137 ms (noise) | 1605 ms (−22%) | 3.33 (noise) | 1.61 s (−23%) | +| **3** | **+ Single-thread vllm.step scheduler (lock removed)** | **520 ms** (−12%) | 1115 ms | 1825 ms (−12%) | 3.41 | 1.83 s | +| **6** | **+ HiFi-GAN decoder TRT fp16** (Round 5 spec-decode blocked by `enable_prompt_embeds`) ⚠️ **AUDIO REGRESSION** | **426 ms** (−28%) | **936 ms** (−18%) | 1143 ms (−45%) | **3.99** (+18%) | **1.73 s** (−17%) | +| **7** | **+ Flow TRT `trt_concurrent=4`** (cheap variant of cross-req batching) — speed numbers were measured WITH `LOAD_TRT_HIFT=1`, audio was broken; `round7_fixed/` has the correct samples generated with `LOAD_TRT_HIFT=0` and the same Flow-concurrency win. | **416 ms** (−29%) | **786 ms** (−31%) | 1092 ms (−47%) | **4.68** (+38%) | **1.29 s** (−38%) | + +## ⚠️ Round 6 audio regression (discovered in Round 9 quality eval) + +The Round 6 hift-TRT integration (`LOAD_TRT_HIFT=1`) **produces saturated +audio** -- every sample value clips to `-1.0`. The fp16 TRT engine appears to +explode the magnitude / phase tensors so that PyTorch's `_istft` then +`audio_limit` clamp drives the waveform to the lower rail. 
Whisper cannot +transcribe these samples (CER = 1.0) and the SECS speaker similarity is +≈ 0 (essentially noise). **Speed numbers in commits 8c8b05f and 29894a7 are +real (the TRT call returns fast), but the audio is unusable.** + +Concrete eval (cpu, base Whisper, ECAPA-TDNN SECS, n=4 per round): + +| round | CER ↓ | SECS ↑ | RMS dB | status | +|---|---:|---:|---:|---| +| round0_baseline | 0.254 | 0.607 | -21.6 | ok | +| round1_fp16 | 0.184 | 0.672 | -20.0 | ok | +| round2_vllm | 0.214 | 0.676 | -21.1 | ok | +| round3_lockfree | 0.270 | 0.662 | -20.4 | ok | +| **round6_hift_trt** | **1.000** | **-0.14** | **0.0** | **broken (saturated)** | +| **round7_flow_concurrent** | **1.000** | **-0.14** | **0.0** | **broken (had hift TRT on)** | +| round7_fixed (`LOAD_TRT_HIFT=0`) | 0.234 | 0.615 | -20.3 | ok | + +(High absolute CER is expected — base Whisper on short Chinese, naive text +normalization. What matters is the regression jump from ~0.25 to 1.00.) + +**Mitigation in this commit:** +- `LOAD_TRT_HIFT` defaults to `0` already in + [cosyvoice/cli/cosyvoice.py:225](cosyvoice/cli/cosyvoice.py#L225); a + warning comment now explains why it should stay off until investigated. +- `round7_fixed/` directory contains the correct R7 audio (Flow concurrency + speedup is unaffected; only the hift TRT path is broken). +- `eval/quality_eval.py` is the framework that caught this; rerun before + any future audio-touching optimization. + +**Open bug to investigate next session:** +- fp16 numerical overflow in Snake activation (`x + (1/α)·sin²(αx)`) — + large `αx` values overflow before saturating. Try keeping Snake layers in + fp32 via TRT `OBEY_PRECISION_CONSTRAINTS`. +- Or: dtype mismatch at engine boundary. Engine outputs fp16; we cast to + fp32 with `.float()` then iSTFT — but maybe the cast comes too late. +- Or: ONNX export of Snake produced wrong constant for `1/α + ε` term. + +## Round 10 — hift TRT fp32 fix (audio correct + still useful speed gain) + +Built the hift engine in fp32 instead of fp16 (env `HIFT_TRT_FP16=0`, +default after this round). The fp16 saturation bug is gone; audio matches +the no-TRT path (`round7_fixed`). Speed vs Round 3 (last clean benchmark): + +| conc | R3 | R10 fp32 hift TRT | Δ | +|---:|---:|---:|---:| +| 4 QPS | 3.41 | **4.97** | **+46%** | +| 4 TTFA p50 | 1115 ms | **743 ms** | **−33%** | +| 8 QPS | 5.81 | 5.74 | −1% | +| 8 TTFA p50 | 1431 ms | **1348 ms** | −6% | +| 16 QPS | 4.54 | **5.60** | +23% | +| 16 TTFA p50 | 3382 ms | **2741 ms** | −19% | + +| round | n | CER | SECS | RMS dB | status | +|---|---:|---:|---:|---:|---| +| round0_baseline | 4 | 0.254 | 0.607 | -21.6 | ok | +| round7_fixed (no hift TRT) | 4 | 0.234 | 0.615 | -20.3 | ok | +| **round10_hift_fp32** | 4 | **0.234** | **0.615** | -20.3 | ok | +| (old) round6_hift_trt fp16 | 4 | 1.000 | -0.14 | 0.0 | broken | + +Same CER and SECS as no-TRT baseline → the fp32 engine is byte-faithful +to the PyTorch reference. fp32 sacrifices the theoretical fp16 ~2x speed +on Snake / ResBlocks but still wins ~20-30% over PyTorch+autocast because +TRT eliminates the Python op-launch overhead and fuses ops better. + +**Default config now**: `LOAD_TRT_HIFT=1 HIFT_TRT_FP16=0 FP16=1 +LOAD_TRT=1 FLOW_TRT_CONCURRENT=4`. + +To revisit fp16 hift later: rebuild the engine with TRT +`OBEY_PRECISION_CONSTRAINTS` flag and per-layer fp32 markings on every +Snake activation — likely cuts hift time ~30% more, but needs careful +layer-name targeting in the engine builder. 
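+
+For reference, per-layer fp32 "islands" are typically marked like this with the stock
+TensorRT Python API (a minimal sketch; the keyword list, paths and function name are
+illustrative and not the repo's actual `convert_onnx_to_trt` helper):
+
+```python
+import tensorrt as trt
+
+def build_fp16_engine_with_fp32_islands(onnx_path, plan_path, fp32_keywords=('activations',)):
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    parser = trt.OnnxParser(network, logger)
+    with open(onnx_path, 'rb') as f:
+        assert parser.parse(f.read()), parser.get_error(0)
+
+    config = builder.create_builder_config()
+    config.set_flag(trt.BuilderFlag.FP16)
+    # Without this flag, per-layer precision is only a hint that TRT may ignore.
+    config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
+
+    forced = 0
+    for i in range(network.num_layers):
+        layer = network.get_layer(i)
+        if any(k in layer.name for k in fp32_keywords):
+            layer.precision = trt.float32                # compute this layer in fp32
+            for j in range(layer.num_outputs):
+                layer.set_output_type(j, trt.float32)    # keep its outputs fp32 too
+            forced += 1
+    print(f'forced {forced}/{network.num_layers} layers to fp32')
+
+    with open(plan_path, 'wb') as f:
+        f.write(builder.build_serialized_network(network, config))
+```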
+ +## Round 11 — fp16 hift + Snake-fp32 mixed precision (null result) + +Tried building the hift engine in fp16 with `OBEY_PRECISION_CONSTRAINTS` ++ per-layer fp32 markings on Sin / Pow / Reciprocal / Div ops (the +decomposed Snake activation in ONNX). Hypothesis: protect Snake from fp16 +overflow while letting the heavy Conv / ConvTranspose layers run fp16. + +- Engine built (229 s) with **289 / 3166 layers (9 %) forced to fp32**. +- Audio is correct: CER 0.234, SECS 0.614 — identical to R10 fp32 hift + and to no-hift-TRT baseline. So the Snake-fp32 strategy *fixes* the + saturation bug. +- BUT throughput is **slower than R10 pure-fp32 by 5-15 %** at every + concurrency (conc=4 QPS 4.21 vs 4.97; conc=16 QPS 5.15 vs 5.60). + The repeated fp16↔fp32 cast layers TRT inserts at every Snake + boundary cost more than the fp16 Conv speedup saves on a network + this Snake-heavy. + +Verdict: **R10 (pure fp32 hift) remains production default.** R11 code +infrastructure (`fp32_layer_keywords` arg in `convert_onnx_to_trt`) is +kept for future experiments — better keyword targeting (only the +`Reciprocal` and second `Mul` of each Snake block, not all Sin/Pow ops) +*might* beat fp32, but the marginal win isn't worth the engine-build +complexity right now. + +| Round | hift TRT mode | conc=4 QPS | TTFA p50 | Audio CER | Status | +|---|---|---:|---:|---:|---| +| (no hift TRT) | PyTorch + autocast | 3.41 (R3) | 1115 | 0.270 | baseline | +| 6 | fp16 unconstrained | 3.99 | 936 | 1.000 | broken | +| 10 | fp32 (no Snake fix) | 4.97 | 743 | 0.234 | works | +| 11 | fp16 + sin/pow/recip/div fp32 (broad keywords, 289 layers) | 4.21 | 846 | 0.234 | works but slower | +| 12 | fp16 + `activations` precise keyword (648 layers = 72 Snake × 9 ops) | 4.84 | 758 | 0.234 | works, ≈ fp32 | +| **13** | **fp16 unconstrained + Snake `clamp(inv_alpha, max=6e4)`** | **5.03** | **749** | **0.234** | **production** | + +## Round 12 — precise Snake-fp32 keyword from ONNX node-name probe + +`dump_onnx_nodes.py` showed every Snake op lives under +`//activations./`, so a single substring `'activations'` +exactly targets the Snake math (Reciprocal + Sin + Pow + 4× Mul + 2× Add +× 72 instances = 648 layers, no over-match into ConvTranspose / ResBlock +math). + +R12 numbers are essentially R10 within noise; R12 is a touch faster +than R11 (whose broader keyword set was over-matching Mul/Div/Pow ops in +the cleaner conv chain) but still doesn't beat pure fp32. + +**Why fp16+Snake-fp32 can't beat pure fp32 on this network:** +the 72 Snake activations are *interleaved* through every ResBlock, so +TRT inserts ~144 fp16↔fp32 cast layers (one in / one out per Snake). +The fp16 Conv speedup on the remaining ~80 % of layers is exactly +cancelled by those casts. To actually win in fp16 we'd need to either +(a) replace Snake with a numerically-safe equivalent like +`tanh(αx)`, requiring re-training; or (b) write a custom TRT plugin +that does the Snake math entirely in fp32 inside one kernel, avoiding +the per-op cast overhead. + +**Production config remains R10**: pure fp32 hift TRT engine. 
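+
+The node-name probe behind that keyword is only a few lines of ONNX introspection
+(a sketch of what `dump_onnx_nodes.py` does; the model path and the exact exported
+name pattern are assumptions based on the scopes quoted above):
+
+```python
+import onnx
+from collections import Counter
+
+model = onnx.load('hift_decoder.onnx')        # illustrative path to the exported decoder
+per_scope = Counter()
+for node in model.graph.node:
+    # torch.onnx.export keeps the module path in the node name,
+    # e.g. "/resblocks.3/activations.1/Sin" -> scope "/resblocks.3/activations.1"
+    scope = node.name.rsplit('/', 1)[0]
+    per_scope[(scope, node.op_type)] += 1
+
+for (scope, op), n in sorted(per_scope.items()):
+    marker = '  <-- Snake math' if 'activations' in scope else ''
+    print(f'{scope:55s} {op:12s} x{n}{marker}')
+```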
+ +## Round 13 — Snake `inv_alpha` clamp at the source (real fp16 win) + +Followed up R11/R12 with a diagnostic dump of the trained Snake alpha +values from `hift.pt`: + +``` +overall stats over 10752 alpha values + alpha min=1.6024e-06 max=4.4509e+00 mean=2.2736e-01 + 1/alpha max=6.2369e+05 fp16 max=65504 + values where 1/alpha > 65504 (fp16 overflow): 4 / 10752 + values where 1/alpha > 6500 (close to limit): 56 +``` + +Only 4 outlier channels overflow fp16 — but those 4 channels feed Inf +into the downstream multiply, NaN-poison the magnitude head, and the +iSTFT clamp drives every output sample to ±1.0. **A two-line patch +fixes it at the source**: + +```python +# cosyvoice/transformer/activation.py:Snake.forward +inv_alpha = 1.0 / (alpha + self.no_div_by_zero) +inv_alpha = torch.clamp(inv_alpha, max=6e4) # NEW: fp16-safe (max=65504) +x = x + inv_alpha * pow(sin(x * alpha), 2) +``` + +The clamp affects only the 4 outlier channels (0.04 %); on the other +99.96 % of channels the math is identical (no clamp triggers). After +re-exporting hift ONNX with the patched Snake and rebuilding the fp16 +TRT engine **without any precision constraints**: + +| metric | R10 fp32 (no Snake fix) | R13 fp16 + Snake clamp | +|---|---:|---:| +| Audio CER | 0.234 | 0.234 | +| Audio SECS | 0.615 | 0.615 | +| Engine build | ~30 s | **32 s** (vs R11/R12: 230 s) | +| conc=1 TTFA p50 | 444 ms | **409 ms** (−8 %) | +| conc=4 QPS | 4.97 | **5.03** (+1 %) | +| conc=4 TTFA p95 | 1054 ms | **938 ms** (−11 %) | +| conc=8 QPS | 5.74 | 5.44 (−5 %, noise) | +| conc=16 QPS | 5.60 | 5.24 (−6 %, noise) | + +Same audio, faster engine build, lower tail latency at conc=4 (the +production sweet spot per SLO). Peak QPS at conc=8/16 is within +benchmark noise of R10. **R13 is the new production default**: + +``` +LOAD_TRT=1 FP16=1 LOAD_TRT_HIFT=1 HIFT_TRT_FP16=1 <-- was 0 before R13 +FLOW_TRT_CONCURRENT=4 +``` + +The `fp32_layer_keywords` infrastructure from R11/R12 stays in place +behind env `HIFT_TRT_FP32_KW=1` for the unlikely future case where +re-trained Snake alphas drift back into the overflow range. + +Round 7 details: full Flow cross-request batching needed re-exporting the +TRT engine away from the CFG-baked batch=2 layout (1-2 days of work, +30-50% best-case Flow gain). Instead bumped `trt_concurrent` from 1 to 4, +which is the supported pattern -- 4 dedicated CUDA streams + execution +contexts share the same engine weights (~1 GB extra GPU). Concurrent +requests now run on different streams without re-serializing. + +| conc | Round 6 QPS | Round 7 QPS | Round 6 TTFA p50 | Round 7 TTFA p50 | +|---:|---:|---:|---:|---:| +| 1 | 0.41 | 0.41 | 426 ms | 416 ms | +| 4 | 3.99 | **4.68** (+17%) | 936 ms | **786 ms** (−16%) | +| 8 | **7.22** | 5.55 (−23%, likely contention with hift TRT context) | 1432 | 1481 | +| 16 | 5.19 | **6.63** (+28%) | 3102 | **2542 ms** (−18%) | + +Tradeoff: peak QPS shifts down slightly (7.22 → 6.63) but TTFA at every +SLO-relevant concurrency improves. conc=8 single-point regression looks +like GPU resource contention between 4 Flow contexts and the hift +context; conc=4 (one hift call per Flow call) and conc=16 (already +saturated) both win. + +Round 6 is best at **conc=8: QPS 7.22, TTFA p50 1432 ms, audio throughput +14.03×** real-time -- a clean +24% QPS over Round 3's conc=8 peak (5.81) +with identical TTFA. 
Cumulative vs Round 0 baseline: + +| metric | Round 0 | Round 6 | Δ | +|---|---:|---:|---:| +| Peak QPS (short, conc=8) | 2.71 | **7.22** | **+166%** | +| Audio throughput @ peak | 5.27× | **14.03×** | **+166%** | +| TTFA p50 @ peak | 2942 ms | **1432 ms** | −51% | +| TTFA p50 @ conc=1 | 1170 ms | **426 ms** | **−64%** | +| TTFA p50 @ conc=4 | 1772 ms | **936 ms** | −47% | + +Round 2 wins are at higher concurrency where the larger KV-cache budget lets +vLLM batch more aggressively (low conc was already saturated): + +| conc | Round 0 QPS | Round 2 QPS | Round 0 TTFA p50 | Round 2 TTFA p50 | Round 2 audio thru | +|---:|---:|---:|---:|---:|---:| +| 8 | 2.71 | **4.44** (+64%) | 2942 ms | 1787 ms (−39%) | 8.4× | +| 16 | 3.14 | **5.33** (+70%) | 4772 ms | 2973 ms (−38%) | **10.04×** | + +Round 3 collapses peak concurrency from 16 → 8 by removing per-thread +`vllm.step()` lock contention (Single-thread vllm scheduler dispatches +tokens to per-uuid queues; clients block on `queue.get()` instead of +holding a global lock + sleep-polling): + +| conc | Round 2 QPS | Round 3 QPS | Round 2 TTFA p50 | Round 3 TTFA p50 | +|---:|---:|---:|---:|---:| +| 8 | 4.44 | **5.81** (+31%) | 1787 ms | **1431 ms** (−20%) | +| 16 | **5.33** | 4.54 (−15%) | 2973 ms | 3382 ms (+14%) | +| 32 | n/a | 4.93 | n/a | 6059 ms | + +Round 3 peak is **5.81 QPS at TTFA 1.4 s** (conc=8) vs Round 2's +**5.33 QPS at TTFA 3.0 s** (conc=16) — same throughput, half the +latency, half the queue depth. The conc=16 regression is the new +GIL-bound bottleneck: dispatching tokens from the scheduler to many +waiting `queue.put()` calls per `vllm.step()` saturates the GIL. + +Effective production capacity (TTFA ≤ 1.5 s SLO): + +| Round | Best conc | QPS | TTFA p50 | +|---:|---:|---:|---:| +| 0 | 4 | 2.68 | 1772 ms (over SLO) | +| 1 | 4 | 3.58 | 997 ms | +| 2 | 4 | 3.33 | 1137 ms | +| 3 | 8 | **5.81** | **1431 ms** | + +## Post-Round-3 stage profile (where the time actually goes) + +Run with FP16=1 + FE-cache + lock-free + scheduler thread, single-stream +sequential, 6 reps post-warmup, on the same 3090. + +**Stream short text (~9 chars, ~2.9 s audio, 1 chunk):** + +| Stage | avg | % of TTFA | +|---|---:|---:| +| LLM total (72 tokens, first=11ms, per=5.2ms) | 382 ms | **62%** | +| Token2Wav first chunk (Flow + HiFi) | ~219 ms | 35% | +| Other (FE cache hit + framing) | ~16 ms | 3% | +| **TTFA** | **601 ms** | 100% | + +**Stream medium text (~50 chars, ~11 s audio, 3.2 chunks):** + +| Stage | per request | % of TOTAL | +|---|---:|---:| +| LLM total | 1428 ms | **80%** | +| T2W.flow_ms (TRT fp16) | 360 ms (~113/chunk) | 20% | +| T2W.hift_ms (PyTorch + autocast fp16) | 297 ms (~93/chunk) | 17% | +| **TOTAL wall** | **1779 ms** | 100% | +| TTFA | 765 ms | — | + +(Flow + HiFi overlap with LLM in stream mode, so TOTAL ≠ sum.) + +## Where the easy wins live (and don't) + +**LLM is now the wall (62% short / 80% medium).** Per-token rate is +already 5.2 ms (192 tok/s) — vLLM continuous batching is doing its job. 
+Dropping below this for TTFA needs an architectural change: + +| Lever | Expected TTFA gain | Effort | Notes | +|---|---:|---|---| +| Speculative decoding (draft+verify) | -30% LLM, ~−115 ms TTFA | 1-2 days | Need a draft model, vLLM 0.11 supports it | +| HiFi-GAN → TRT fp16 (per Round-4 plan) | -30 to -50 ms/chunk | 1-2 days | Original plan; ROI is small at the new bottleneck shape | +| Flow batching across concurrent reqs | concurrent QPS x2 | 2-3 days | Doesn't move TTFA; lifts ceiling at conc=16+ | +| Smaller TTS model (Kokoro/Piper) | TTFA <300 ms | 3-5 days | Different model, different voice quality | +| Round 3's GIL ceiling at conc=16 | +10-15% QPS at conc≥16 | 4-6 hours | Replace per-uuid Queue with shared epoll-style dispatch | + +**Verdict**: rounds 1-3 captured the cheap wins. Rounds 4+ are +multi-day investments with smaller percentage returns. Pick based on +SLO target: +- TTFA-bound use case (voice agent) → speculative decoding +- Throughput-bound (batch dubbing) → Flow batching +- Both → smaller model + +Notes: +- `enable_prefix_caching=True` was silently ignored — vLLM V1 doesn't support + it together with `enable_prompt_embeds`, so it falls back to off. Kept the + flag for future vLLM versions. +- `max_num_seqs=64` was important: with the 0.2→0.6 mem-util bump, vLLM would + otherwise default to ~256 seqs and reserve KV cache for them upfront, eating + most of the headroom. 64 is enough for our concurrent-stream pattern. + +Round 1 wins where it matters most for production: TTFA p95 and tail latency +collapse (−41% / −42%) because the fp16 Flow engine finishes per-request 30% +faster, draining the per-token-decode queue before a second request can pile up. +p50 gain is more modest because it was already dominated by FE/LLM-prefill +floor (~500 ms), not Flow. + +Audio samples: `samples/round0_baseline/` vs `samples/round1_fp16/` — same +prompts/seeds. Long-text (~120 chars) stability checked, no degradation. + +The upstream warning (`DiT tensorRT fp16 engine have some performance issue`) +did not manifest as user-perceptible artifacts in our test set. + +## Key takeaways + +1. **TTFA is dominated by vLLM prefill + first flow-matching batch**, not by GPU throughput. You cannot tune your way past ~1.2s TTFA on a single 3090 for CosyVoice3. +2. **Throughput saturates early** (conc=4 for short, conc=2 for medium) because the pipeline is "thick" — one request already keeps the GPU warm. +3. **Linear TTFA growth with concurrency** is a queueing effect. vLLM batches at the LLM step, but the *decode* phase isn't fully parallelized in CosyVoice's path. +4. **Streaming vs sync**: streaming trades ~20% throughput (5.9x vs 13.6x from the non-streaming bench) for the ability to report TTFA. Worth it for interactive use cases, skip for pure batch. 
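+
+A quick Little's Law sanity check on takeaway 3, using the sweep data above: at conc=32
+medium text, QPS 0.81 × total p50 37.78 s ≈ 30.6 requests in flight, i.e. essentially all
+of the offered concurrency is sitting in queue rather than adding throughput, which is the
+linear TTFA growth visible in the tables.
+
+For reproducibility, the Round 2 vLLM knobs from the notes above map onto engine arguments
+roughly as follows (a sketch; the exact wiring inside CosyVoice's vLLM wrapper may differ,
+and `enable_prompt_embeds` support depends on the vLLM version):
+
+```python
+from vllm import LLM
+
+llm = LLM(
+    model='pretrained_models/Fun-CosyVoice3-0.5B',
+    gpu_memory_utilization=0.6,    # bumped from 0.2 so the KV cache gets real headroom
+    max_num_seqs=64,               # otherwise vLLM reserves KV cache for ~256 sequences upfront
+    enable_chunked_prefill=True,
+    enable_prefix_caching=True,    # silently ignored while enable_prompt_embeds is set (vLLM V1)
+    enable_prompt_embeds=True,     # CosyVoice feeds speech-token embeddings directly
+)
+```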
diff --git a/test_cosyvoice3.py b/test_cosyvoice3.py new file mode 100644 index 000000000..88bc17c61 --- /dev/null +++ b/test_cosyvoice3.py @@ -0,0 +1,25 @@ +import sys, time +sys.path.append('third_party/Matcha-TTS') +from cosyvoice.cli.cosyvoice import AutoModel +import torchaudio + +print('Loading CosyVoice3...', flush=True) +t0 = time.time() +cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=False, load_vllm=False, fp16=False) +print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + +text = '你好,欢迎来到 CosyVoice 三号的世界,今天我们一起来测试一下它的中文合成效果。' +prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' +prompt_wav = './asset/zero_shot_prompt.wav' + +print('Running inference...', flush=True) +t1 = time.time() +total_audio_seconds = 0.0 +for i, j in enumerate(cosyvoice.inference_zero_shot(text, prompt_text, prompt_wav, stream=False)): + out = f'/home/zhiqiang/zero_shot_test_{i}.wav' + torchaudio.save(out, j['tts_speech'], cosyvoice.sample_rate) + dur = j['tts_speech'].shape[-1] / cosyvoice.sample_rate + total_audio_seconds += dur + print(f'chunk {i}: saved {out}, audio_dur={dur:.2f}s, sr={cosyvoice.sample_rate}', flush=True) +elapsed = time.time() - t1 +print(f'Inference done in {elapsed:.2f}s, total_audio={total_audio_seconds:.2f}s, RTF={elapsed/total_audio_seconds:.3f}', flush=True) diff --git a/test_cosyvoice3_trt_vllm.py b/test_cosyvoice3_trt_vllm.py new file mode 100644 index 000000000..b7b4be8f2 --- /dev/null +++ b/test_cosyvoice3_trt_vllm.py @@ -0,0 +1,37 @@ +"""CosyVoice3 with TRT+vLLM. First run compiles TRT engine (5-15 min).""" +import sys, time +sys.path.append('third_party/Matcha-TTS') + +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM +ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.utils.common import set_all_random_seed +import torchaudio + + +def main(): + print('Loading CosyVoice3 with TRT + vLLM (first run compiles TRT engine, may take 5-15min)...', flush=True) + t0 = time.time() + cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False) + print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + + text = '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。' + prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' + prompt_wav = './asset/zero_shot_prompt.wav' + + for run in range(5): + set_all_random_seed(run) + t1 = time.time() + total_audio_seconds = 0.0 + for i, j in enumerate(cosyvoice.inference_zero_shot(text, prompt_text, prompt_wav, stream=False)): + out = f'/home/zhiqiang/trt_test_{run}_{i}.wav' + torchaudio.save(out, j['tts_speech'], cosyvoice.sample_rate) + total_audio_seconds += j['tts_speech'].shape[-1] / cosyvoice.sample_rate + elapsed = time.time() - t1 + print(f'[run {run}] wall={elapsed:.2f}s audio={total_audio_seconds:.2f}s RTF={elapsed/total_audio_seconds:.3f}', flush=True) + + +if __name__ == '__main__': + main() diff --git a/test_cosyvoice3_vllm.py b/test_cosyvoice3_vllm.py new file mode 100644 index 000000000..7be0cc914 --- /dev/null +++ b/test_cosyvoice3_vllm.py @@ -0,0 +1,36 @@ +import sys, time +sys.path.append('third_party/Matcha-TTS') + +from vllm import ModelRegistry +from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM +ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM) + +from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.utils.common import 
set_all_random_seed +import torchaudio + + +def main(): + print('Loading CosyVoice3 with vLLM...', flush=True) + t0 = time.time() + cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=False, load_vllm=True, fp16=False) + print(f'Loaded in {time.time()-t0:.2f}s', flush=True) + + text = '你好,欢迎来到 CosyVoice 三号的世界,今天我们一起来测试一下它的中文合成效果。' + prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。' + prompt_wav = './asset/zero_shot_prompt.wav' + + for run in range(3): + set_all_random_seed(run) + t1 = time.time() + total_audio_seconds = 0.0 + for i, j in enumerate(cosyvoice.inference_zero_shot(text, prompt_text, prompt_wav, stream=False)): + out = f'/home/zhiqiang/vllm_test_{run}_{i}.wav' + torchaudio.save(out, j['tts_speech'], cosyvoice.sample_rate) + total_audio_seconds += j['tts_speech'].shape[-1] / cosyvoice.sample_rate + elapsed = time.time() - t1 + print(f'[run {run}] wall={elapsed:.2f}s audio={total_audio_seconds:.2f}s RTF={elapsed/total_audio_seconds:.3f}', flush=True) + + +if __name__ == '__main__': + main() diff --git a/web/index.html b/web/index.html new file mode 100644 index 000000000..0a1afcd3c --- /dev/null +++ b/web/index.html @@ -0,0 +1,324 @@ + + + + + +CosyVoice3 TTS 测试 + + + + +

+<!-- web/index.html: single-page browser test console for the TTS server.
+     Title: "CosyVoice3 TTS 测试" (CosyVoice3 TTS test). Header line: single RTX 3090,
+     vLLM 0.11 + TRT, 24kHz mono, default voice prompt "希望你以后能够做的比我还好呦".
+     A text box plus seed field sends requests; a per-request result panel reports
+     TTFA (first audio), total wall time, audio duration, RTF, audio bytes and chunk
+     count, with a status line that starts at "就绪。" (ready). A second panel shows the
+     server's cumulative /metrics, refreshed on a button click. -->