Coverage for integrations / service_tools / cosyvoice_tool.py: 35.2%

71 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2CosyVoice 3 TTS tool — multilingual zero-shot voice cloning (GPU). 

3 

4Supports: zh, ja, ko, de, es, fr, it, ru, en (9 languages). 

5VRAM: 3.5GB model size, 4GB recommended. 

6Requires: a local clone of the CosyVoice repo at ~/PycharmProjects/CosyVoice (not pip-installable; see _COSYVOICE_CLONE) 

7 

8SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the 

9worker subprocess entry point. CUDA OOM or DLL crashes stay contained 

10in the worker; the parent falls back gracefully. 

11 

12Public API (parent): 

13 cosyvoice_synthesize(text, language, voice, output_path) → JSON 

14 unload_cosyvoice() → None 

15 

16Worker entry: 

17 python -m integrations.service_tools.cosyvoice_tool 

18""" 

19 

20import os 

21from typing import Optional 

22 

23from integrations.service_tools.gpu_worker import ToolWorker 

24 

# Fallback sample rate (Hz) passed by cosyvoice_synthesize() as the
# default_sample_rate kwarg, for when the worker response doesn't carry
# one. The actual sample rate is reported by model.sample_rate at runtime
# (the original note here says 22050 for CosyVoice3-0.5B).
# NOTE(review): upstream CosyVoice 2/3 configs appear to use 24000 Hz —
# confirm against model.sample_rate before relying on this fallback.
SAMPLE_RATE = 22050

# Default reference voice for CosyVoice zero-shot cloning — same path
# Nunba used historically. _synthesize() only picks it up when the request
# carries no voice and this file exists on disk.
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')

# CosyVoice 3 lives in a dev clone (not pip). The clone includes a
# Matcha-TTS dependency that also needs to be on sys.path.
_COSYVOICE_CLONE = os.path.join(
    os.path.expanduser('~'), 'PycharmProjects', 'CosyVoice',
)
# Weights directory inside the clone; _load() downloads _COSYVOICE_HF_REPO
# into it when the directory does not exist.
_COSYVOICE_MODEL_DIR = os.path.join(
    _COSYVOICE_CLONE, 'pretrained_models', 'CosyVoice3-0.5B',
)
_COSYVOICE_HF_REPO = 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'

# CosyVoice 3 requires every prompt to be prefixed with this token.
_COSYVOICE_PROMPT_PREFIX = 'You are a helpful assistant.<|endofprompt|>'

# Trailing silence pad (seconds) appended to the synthesized audio to
# prevent chopped endings.
_END_PAD_SECONDS = 0.3

49 

50 

51# ── Worker callbacks (run in subprocess) ────────────────────────── 

52 

def _load():
    """Build the CosyVoice 3 0.5B model from the local dev clone.

    Worker-side callback, ported from Nunba's _LazyCosyVoice3:
      - the clone at ``_COSYVOICE_CLONE`` must exist; it and its bundled
        Matcha-TTS directory (when present) are put at the front of sys.path
      - the model is created through cosyvoice.cli.cosyvoice.AutoModel
        (not the pip CosyVoice class)
      - the weights are fetched from HuggingFace when the model directory
        is missing

    Returns:
        The object produced by ``AutoModel(model_dir=_COSYVOICE_MODEL_DIR)``.

    Raises:
        FileNotFoundError: if the clone directory does not exist.
    """
    import sys

    clone_dir = _COSYVOICE_CLONE
    if not os.path.isdir(clone_dir):
        raise FileNotFoundError(
            f"CosyVoice 3 not found at {clone_dir} — clone the CosyVoice repo to that path."
        )

    # Entries are inserted at index 0 one after another, so the last one
    # (Matcha-TTS) ends up ahead of the clone itself on sys.path.
    path_entries = [clone_dir]
    matcha_dir = os.path.join(clone_dir, 'third_party', 'Matcha-TTS')
    if os.path.isdir(matcha_dir):
        path_entries.append(matcha_dir)
    for entry in path_entries:
        if entry not in sys.path:
            sys.path.insert(0, entry)

    # Importable only once the clone is on sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel

    weights_present = os.path.isdir(_COSYVOICE_MODEL_DIR)
    if not weights_present:
        from huggingface_hub import snapshot_download
        snapshot_download(_COSYVOICE_HF_REPO, local_dir=_COSYVOICE_MODEL_DIR)

    loaded_model = AutoModel(model_dir=_COSYVOICE_MODEL_DIR)
    return loaded_model

84 

85 

def _synthesize(model, req: dict) -> dict:
    """Synthesize ``req['text']`` with CosyVoice 3 and save it to ``req['output_path']``.

    Worker-side callback. The voice is resolved as follows:
      1. ``req['voice']`` when it is set and points at an existing file, or
         ``_DEFAULT_REF_VOICE`` when no voice was requested and that file
         exists -> zero-shot cloning via ``model.inference_cross_lingual``.
      2. Otherwise the first speaker from ``model.list_available_spks()``
         via ``model.inference_sft``.

    Every chunk yielded by the inference generator is concatenated. The
    previous implementation stopped after the first chunk, which drops the
    rest of the audio whenever the model yields more than one chunk
    (upstream CosyVoice yields one chunk per normalized text segment).

    Args:
        model: object returned by ``_load()``.
        req: request dict; reads the ``text``, ``output_path`` and
            ``voice`` keys.

    Returns:
        On success a dict with ``path``, ``duration`` (seconds, including
        the trailing pad), ``sample_rate``, ``engine``, ``device`` and
        ``voice``. ``voice`` is the reference file actually used for
        cloning, or ``'default'`` when a built-in speaker was used.
        On failure a dict with a single ``error`` message.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # CosyVoice 3 requires an explicit assistant prefix token
    cv3_text = f'{_COSYVOICE_PROMPT_PREFIX}{text}'

    # Resolve reference voice for zero-shot cloning; fall back to
    # inference_sft with the first available built-in speaker.
    ref = req.get('voice')
    if not ref and os.path.isfile(_DEFAULT_REF_VOICE):
        ref = _DEFAULT_REF_VOICE

    if ref and os.path.isfile(ref):
        voice_used = ref
        chunks = model.inference_cross_lingual(cv3_text, ref, stream=False)
    else:
        spks = model.list_available_spks() if hasattr(model, 'list_available_spks') else []
        if not spks:
            return {'error': 'CosyVoice3: no reference voice and no built-in speakers'}
        # Report 'default' rather than echoing a requested reference path
        # that does not exist and therefore was not used.
        voice_used = 'default'
        chunks = model.inference_sft(cv3_text, spks[0], stream=False)

    # Drain the generator completely so no audio segment is lost.
    segments = [chunk['tts_speech'] for chunk in chunks]
    if not segments:
        return {'error': 'CosyVoice3: synthesis returned no audio'}

    import torch

    # Normalize every segment to (channels, samples) before joining.
    segments = [seg.unsqueeze(0) if seg.ndim == 1 else seg for seg in segments]

    # Pad 0.3s silence to prevent chopped endings
    sr = model.sample_rate
    first = segments[0]
    pad = torch.zeros(
        first.shape[0],
        int(sr * _END_PAD_SECONDS),
        dtype=first.dtype, device=first.device,
    )
    audio = torch.cat([*segments, pad], dim=-1)

    import torchaudio
    torchaudio.save(output_path, audio.cpu(), sr)

    return {
        'path': output_path,
        'duration': round(audio.shape[-1] / sr, 2),
        'sample_rate': sr,
        'engine': 'cosyvoice3',
        'device': 'cuda',
        'voice': voice_used,
    }

144 

145 

146# ── Parent-side: ToolWorker instance ───────────────────────────── 

147 

# Parent-side handle for the CosyVoice worker. tool_module names this very
# module: per the note at the end of the file, the gpu_worker dispatcher
# imports it and calls _load / _synthesize.
# NOTE(review): the two timeouts are presumably seconds, and vram_budget /
# output_subdir are presumably keys/paths interpreted by ToolWorker —
# confirm against gpu_worker.ToolWorker.
_tool = ToolWorker(
    tool_name='cosyvoice3',
    tool_module='integrations.service_tools.cosyvoice_tool',
    vram_budget='tts_cosyvoice3',
    output_subdir='cosyvoice/output',
    engine='cosyvoice3',
    startup_timeout=120.0,
    request_timeout=120.0,
)

157 

158 

def cosyvoice_synthesize(
    text: str,
    language: str = 'zh',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Run a CosyVoice 3 synthesis request through the GPU worker.

    Parent-side entry point: forwards the arguments unchanged to
    ``_tool.synthesize`` together with ``SAMPLE_RATE`` as the fallback
    sample rate, and returns its result (described as JSON in the module
    docstring).

    Args:
        text: text to speak.
        language: language code, ``'zh'`` by default.
        voice: optional reference audio path for voice cloning.
        output_path: optional destination path for the audio file.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
        'default_sample_rate': SAMPLE_RATE,
    }
    return _tool.synthesize(**request)

173 

174 

def unload_cosyvoice():
    """Shut down the CosyVoice worker subprocess so the VRAM it holds is released."""
    _tool.stop()

178 

179 

class CosyVoiceTool:
    """Register CosyVoice as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Add the ``cosyvoice`` entry to the service tool registry.

        Builds a ``ServiceToolInfo`` describing the single ``synthesize``
        endpoint, marks it healthy and stores it under the ``"cosyvoice"``
        key of the registry.

        Returns:
            ``True`` once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        tool_info = ServiceToolInfo(
            name="cosyvoice",
            description=(
                "CosyVoice 3: multilingual zero-shot TTS. "
                "9 languages (zh/ja/ko/de/es/fr/it/ru/en), 3.5GB VRAM. "
                # Was "pip install cosyvoice", but the loader in this module
                # only works from a local repo clone (not pip).
                "Requires: local clone of the CosyVoice repo (not pip-installable)"
            ),
            base_url="inprocess://cosyvoice",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with CosyVoice 3 (9 languages, GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Reference audio path"},
                    },
                },
            },
            tags=["tts", "speech", "voice-cloning", "gpu", "cosyvoice", "multilingual"],
            # NOTE(review): 60 here is shorter than the worker's 120.0
            # startup/request timeouts — confirm which one should win.
            timeout=60,
        )
        # Marked healthy unconditionally; no probe of the worker is made here.
        tool_info.is_healthy = True
        # NOTE(review): writes the registry's private _tools mapping directly —
        # switch to a public registration method if the registry offers one.
        service_tool_registry._tools["cosyvoice"] = tool_info
        return True

212 

213# NOTE: no `if __name__ == '__main__':` block — the centralized 

214# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.