Coverage for integrations / service_tools / cosyvoice_tool.py: 35.2%
71 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
"""
CosyVoice 3 TTS tool — multilingual zero-shot voice cloning (GPU).

Supports: zh, ja, ko, de, es, fr, it, ru, en (9 languages).
VRAM: 3.5GB model size, 4GB recommended.
Requires: a local clone of the CosyVoice repo (see _COSYVOICE_CLONE);
the model is loaded from that clone, not from a pip package.

SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the
worker-side implementation. CUDA OOM or DLL crashes stay contained
in the worker; the parent falls back gracefully.

Public API (parent):
    cosyvoice_synthesize(text, language, voice, output_path) → JSON
    unload_cosyvoice() → None

Worker entry:
    The centralized gpu_worker dispatcher imports
    integrations.service_tools.cosyvoice_tool and calls _load / _synthesize
    (this module has no ``__main__`` block of its own).
"""
20import os
21from typing import Optional
23from integrations.service_tools.gpu_worker import ToolWorker
# Sample rate the parent hands to the worker tool as ``default_sample_rate``
# for responses that do not carry one. The real rate comes from
# ``model.sample_rate`` at runtime (22050 for CosyVoice3-0.5B).
SAMPLE_RATE = 22050

# Reference clip used for zero-shot cloning when a request names no voice —
# the same path Nunba used historically.
_DEFAULT_REF_VOICE = os.path.join(
    os.path.expanduser('~'), 'Downloads', 'Lily.mp3',
)

# CosyVoice 3 is loaded from a dev clone rather than a pip package. The clone
# bundles a Matcha-TTS dependency that must be importable as well.
_COSYVOICE_CLONE = os.path.join(
    os.path.expanduser('~'), 'PycharmProjects', 'CosyVoice',
)
_COSYVOICE_MODEL_DIR = os.path.join(
    _COSYVOICE_CLONE, 'pretrained_models', 'CosyVoice3-0.5B',
)
_COSYVOICE_HF_REPO = 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'

# Token that CosyVoice 3 expects in front of every prompt.
_COSYVOICE_PROMPT_PREFIX = 'You are a helpful assistant.<|endofprompt|>'

# Seconds of silence appended to the audio so endings are not chopped.
_END_PAD_SECONDS = 0.3
51# ── Worker callbacks (run in subprocess) ──────────────────────────
def _load():
    """Build the CosyVoice 3 0.5B model from the local dev clone.

    Port of Nunba's _LazyCosyVoice3 loader:
      * needs the ~/PycharmProjects/CosyVoice checkout; its bundled
        third_party/Matcha-TTS directory is put on sys.path when present
      * imports AutoModel from cosyvoice.cli.cosyvoice (not the pip
        CosyVoice class)
      * fetches the CosyVoice3-0.5B weights from HuggingFace when the
        model directory does not exist yet

    Returns:
        The AutoModel instance created for ``_COSYVOICE_MODEL_DIR``.

    Raises:
        FileNotFoundError: the clone directory does not exist.
    """
    import sys

    if not os.path.isdir(_COSYVOICE_CLONE):
        message = (
            f"CosyVoice 3 not found at {_COSYVOICE_CLONE} — clone the "
            f"CosyVoice repo to that path."
        )
        raise FileNotFoundError(message)

    # Each entry is pushed to the front of sys.path in turn, so the
    # Matcha-TTS directory (added last) ends up ahead of the clone itself.
    matcha_dir = os.path.join(_COSYVOICE_CLONE, 'third_party', 'Matcha-TTS')
    search_dirs = [_COSYVOICE_CLONE]
    if os.path.isdir(matcha_dir):
        search_dirs.append(matcha_dir)
    for entry in search_dirs:
        if entry not in sys.path:
            sys.path.insert(0, entry)

    # Only importable once the clone is on sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel

    weights_present = os.path.isdir(_COSYVOICE_MODEL_DIR)
    if not weights_present:
        # Pull the model weights on first use.
        from huggingface_hub import snapshot_download
        snapshot_download(_COSYVOICE_HF_REPO, local_dir=_COSYVOICE_MODEL_DIR)

    return AutoModel(model_dir=_COSYVOICE_MODEL_DIR)
def _synthesize(model, req: dict) -> dict:
    """Synthesize ``req['text']`` into ``req['output_path']`` (worker callback).

    Args:
        model: The object returned by ``_load()``. It must provide
            ``inference_cross_lingual`` and ``sample_rate``; the built-in
            speaker fallback additionally uses ``list_available_spks`` and
            ``inference_sft``.
        req: Request dict. Keys read here: ``text`` (required),
            ``output_path`` (required) and ``voice`` (optional path to a
            reference audio file for zero-shot cloning).

    Returns:
        On success, a dict with ``path``, ``duration`` (seconds, trailing
        pad included), ``sample_rate``, ``engine``, ``device`` and
        ``voice``. On a validation or synthesis failure,
        ``{'error': <message>}``.
    """
    text = req.get('text', '')
    # A non-string payload is rejected like empty text instead of crashing
    # on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # CosyVoice 3 requires an explicit assistant prefix token.
    cv3_text = f'{_COSYVOICE_PROMPT_PREFIX}{text}'

    # Resolve the reference voice for zero-shot cloning; without a usable
    # reference file, fall back to inference_sft with the first built-in
    # speaker.
    ref = req.get('voice')
    if not ref and os.path.isfile(_DEFAULT_REF_VOICE):
        ref = _DEFAULT_REF_VOICE

    if ref and os.path.isfile(ref):
        chunks = model.inference_cross_lingual(cv3_text, ref, stream=False)
    else:
        spks = model.list_available_spks() if hasattr(model, 'list_available_spks') else []
        if not spks:
            return {'error': 'CosyVoice3: no reference voice and no built-in speakers'}
        chunks = model.inference_sft(cv3_text, spks[0], stream=False)

    import torch

    # The inference generators may yield more than one chunk (upstream
    # usage saves one clip per yielded item), so gather every chunk rather
    # than keeping only the first — otherwise longer text is cut short.
    segments = []
    for chunk in chunks:
        speech = chunk['tts_speech']
        if speech is None:
            continue
        # Normalise to (channels, samples) so chunks and the pad line up.
        segments.append(speech if speech.ndim > 1 else speech.unsqueeze(0))

    if not segments:
        return {'error': 'CosyVoice3: synthesis returned no audio'}
    audio = segments[0] if len(segments) == 1 else torch.cat(segments, dim=-1)

    # Append trailing silence to prevent chopped endings.
    sr = model.sample_rate
    pad = torch.zeros(
        audio.shape[0],
        int(sr * _END_PAD_SECONDS),
        dtype=audio.dtype, device=audio.device,
    )
    audio = torch.cat([audio, pad], dim=-1)

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    import torchaudio
    torchaudio.save(output_path, audio.cpu(), sr)

    return {
        'path': output_path,
        'duration': round(audio.shape[-1] / sr, 2),
        'sample_rate': sr,
        'engine': 'cosyvoice3',
        'device': 'cuda',
        # NOTE: echoes the requested voice even when that path was not an
        # existing file and the built-in speaker was used instead.
        'voice': ref or 'default',
    }
146# ── Parent-side: ToolWorker instance ─────────────────────────────
# Parent-side handle for the CosyVoice 3 worker. ``tool_module`` is this
# module's dotted path; startup and per-request timeouts are both 120 s.
_tool = ToolWorker(
    engine='cosyvoice3',
    tool_name='cosyvoice3',
    tool_module='integrations.service_tools.cosyvoice_tool',
    output_subdir='cosyvoice/output',
    vram_budget='tts_cosyvoice3',
    request_timeout=120.0,
    startup_timeout=120.0,
)
def cosyvoice_synthesize(
    text: str,
    language: str = 'zh',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Run one CosyVoice 3 request through the GPU worker (9 languages).

    Every argument is forwarded unchanged to ``_tool.synthesize`` together
    with ``SAMPLE_RATE`` as ``default_sample_rate``; the value returned by
    that call is handed back as-is.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
        'default_sample_rate': SAMPLE_RATE,
    }
    return _tool.synthesize(**request)
def unload_cosyvoice():
    """Shut down the CosyVoice worker subprocess so its VRAM is released."""
    _tool.stop()
    return None
class CosyVoiceTool:
    """Register CosyVoice as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Create the ``cosyvoice`` registry entry and mark it healthy.

        The entry advertises a single ``synthesize`` endpoint under the
        ``inprocess://cosyvoice`` base URL.

        Returns:
            ``True`` once the entry has been stored in the registry.
        """
        # Imported at registration time rather than at module import.
        from .registry import ServiceToolInfo, service_tool_registry

        tool_info = ServiceToolInfo(
            name="cosyvoice",
            description=(
                "CosyVoice 3: multilingual zero-shot TTS. "
                "9 languages (zh/ja/ko/de/es/fr/it/ru/en), 3.5GB VRAM. "
                # The loader in this module needs the dev clone; the old
                # "pip install cosyvoice" hint did not match it.
                "Requires: a local clone of the CosyVoice repo "
                "(not the pip package)"
            ),
            base_url="inprocess://cosyvoice",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with CosyVoice 3 (9 languages, GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Reference audio path"},
                    },
                },
            },
            tags=["tts", "speech", "voice-cloning", "gpu", "cosyvoice", "multilingual"],
            # NOTE(review): 60 here vs. the 120 s startup/request timeouts
            # configured on the ToolWorker — confirm which limit should apply.
            timeout=60,
        )
        tool_info.is_healthy = True
        # Stored directly in the registry's ``_tools`` mapping.
        service_tool_registry._tools["cosyvoice"] = tool_info
        return True
213# NOTE: no `if __name__ == '__main__':` block — the centralized
214# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.