Coverage for integrations/service_tools/cosyvoice

1"""

2CosyVoice 3 TTS tool — multilingual zero-shot voice cloning (GPU).

4Supports: zh, ja, ko, de, es, fr, it, ru, en (9 languages).

5VRAM: 3.5GB model size, 4GB recommended.

6Requires: a local CosyVoice dev clone (not pip-installable; see _COSYVOICE_CLONE)

8SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the

9worker subprocess entry point. CUDA OOM or DLL crashes stay contained

10in the worker; the parent falls back gracefully.

12Public API (parent):

13 cosyvoice_synthesize(text, language, voice, output_path) → JSON

14 unload_cosyvoice() → None

16Worker entry:

17 python -m integrations.service_tools.cosyvoice_tool

18"""

20import os

21from typing import Optional

23from integrations.service_tools.gpu_worker import ToolWorker

# Fallback sample rate handed to the parent-side ``default_sample_rate``
# kwarg for when the worker response doesn't carry one. The actual rate
# is read from ``model.sample_rate`` at runtime (22050 for CosyVoice3-0.5B).
SAMPLE_RATE = 22050

# Default reference voice used when a request does not name one — same
# path Nunba used historically. Set COSYVOICE_REF_VOICE to point at a
# different audio file; an unset or empty variable keeps the default.
_DEFAULT_REF_VOICE = os.environ.get('COSYVOICE_REF_VOICE') or os.path.join(
    os.path.expanduser('~'), 'Downloads', 'Lily.mp3',
)

# CosyVoice 3 lives in a dev clone (not pip). The clone includes a
# Matcha-TTS dependency that also needs to be on sys.path.
# Set COSYVOICE_CLONE_DIR to use a clone stored somewhere other than the
# historical location; an unset or empty variable keeps the default.
_COSYVOICE_CLONE = os.environ.get('COSYVOICE_CLONE_DIR') or os.path.join(
    os.path.expanduser('~'), 'PycharmProjects', 'CosyVoice',
)
# Model weights are kept inside the clone.
_COSYVOICE_MODEL_DIR = os.path.join(
    _COSYVOICE_CLONE, 'pretrained_models', 'CosyVoice3-0.5B',
)
# HuggingFace repo the weights are fetched from when the model dir is absent.
_COSYVOICE_HF_REPO = 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'

# CosyVoice 3 requires every prompt to be prefixed with this token.
_COSYVOICE_PROMPT_PREFIX = 'You are a helpful assistant.<|endofprompt|>'

# Trailing silence pad (seconds) to prevent chopped endings.
_END_PAD_SECONDS = 0.3

51# ── Worker callbacks (run in subprocess) ──────────────────────────

def _load():
    """Build the CosyVoice 3 0.5B model from the dev clone (worker side).

    Ported from Nunba's _LazyCosyVoice3. Steps, in order:

    1. Check that the clone directory ``_COSYVOICE_CLONE`` exists.
    2. Put the clone and, when present, its ``third_party/Matcha-TTS``
       directory at the front of ``sys.path``.
    3. Import ``AutoModel`` from ``cosyvoice.cli.cosyvoice`` (the clone's
       class, not the pip CosyVoice class).
    4. Download the weights from ``_COSYVOICE_HF_REPO`` when
       ``_COSYVOICE_MODEL_DIR`` is not a directory yet.

    Returns:
        The ``AutoModel`` instance created from ``_COSYVOICE_MODEL_DIR``.

    Raises:
        FileNotFoundError: the clone directory does not exist.
    """
    import sys

    clone_dir = _COSYVOICE_CLONE
    if not os.path.isdir(clone_dir):
        raise FileNotFoundError(
            f"CosyVoice 3 not found at {clone_dir} — clone the "
            f"CosyVoice repo to that path."
        )

    # Each entry is inserted at index 0, so the Matcha-TTS directory (added
    # last) ends up ahead of the clone on sys.path.
    search_paths = [clone_dir]
    matcha_dir = os.path.join(clone_dir, 'third_party', 'Matcha-TTS')
    if os.path.isdir(matcha_dir):
        search_paths.append(matcha_dir)
    for entry in search_paths:
        if entry not in sys.path:
            sys.path.insert(0, entry)

    # Only importable once the clone is on sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel

    model_dir = _COSYVOICE_MODEL_DIR
    if not os.path.isdir(model_dir):
        # Weights are missing: fetch them from HuggingFace into the clone.
        from huggingface_hub import snapshot_download
        snapshot_download(_COSYVOICE_HF_REPO, local_dir=model_dir)

    return AutoModel(model_dir=model_dir)

def _synthesize(model, req: dict) -> dict:
    """Synthesize one request with the loaded model and write it to disk.

    Args:
        model: The object returned by ``_load``. This function uses its
            ``inference_cross_lingual``, ``inference_sft``,
            ``list_available_spks`` (optional) and ``sample_rate`` members.
        req: Request dict. Keys read here:
            ``text`` (required, non-blank string),
            ``output_path`` (required, destination audio file),
            ``voice`` (optional path to a reference audio file).

    Returns:
        On success a dict with ``path``, ``duration`` (seconds, rounded to
        2 decimals, trailing pad included), ``sample_rate``, ``engine``,
        ``device`` and ``voice``. On a validation or synthesis failure a
        dict with a single ``error`` message.
    """
    text = req.get('text', '')
    # Reject non-strings too: calling .strip() on them would raise.
    if not isinstance(text, str) or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # CosyVoice 3 requires an explicit assistant prefix token
    cv3_text = f'{_COSYVOICE_PROMPT_PREFIX}{text}'

    # Resolve the reference voice; fall back to inference_sft with the
    # first available built-in speaker when no usable reference exists.
    ref = req.get('voice')
    if not ref and os.path.isfile(_DEFAULT_REF_VOICE):
        ref = _DEFAULT_REF_VOICE

    if ref and os.path.isfile(ref):
        voice_used = ref
        chunks = model.inference_cross_lingual(cv3_text, ref, stream=False)
    else:
        # Report 'default' rather than a reference path that was not used.
        voice_used = 'default'
        spks = model.list_available_spks() if hasattr(model, 'list_available_spks') else []
        if not spks:
            return {'error': 'CosyVoice3: no reference voice and no built-in speakers'}
        chunks = model.inference_sft(cv3_text, spks[0], stream=False)

    import torch

    # Keep every yielded chunk. Stopping after the first one truncates the
    # audio whenever the generator yields several chunks (upstream CosyVoice
    # appears to yield one per normalized text segment — confirm).
    segments = []
    for chunk in chunks:
        speech = chunk['tts_speech']
        if speech is None:
            continue
        # Normalise to 2-D so chunks concatenate along the last axis.
        segments.append(speech.unsqueeze(0) if speech.ndim == 1 else speech)

    if not segments:
        return {'error': 'CosyVoice3: synthesis returned no audio'}
    audio = segments[0] if len(segments) == 1 else torch.cat(segments, dim=-1)

    # Pad trailing silence to prevent chopped endings
    sr = model.sample_rate
    pad = torch.zeros(
        audio.shape[0],
        int(sr * _END_PAD_SECONDS),
        dtype=audio.dtype, device=audio.device,
    )
    audio = torch.cat([audio, pad], dim=-1)

    # Make sure the destination directory exists before saving.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    import torchaudio
    torchaudio.save(output_path, audio.cpu(), sr)

    return {
        'path': output_path,
        'duration': round(audio.shape[-1] / sr, 2),
        'sample_rate': sr,
        'engine': 'cosyvoice3',
        'device': 'cuda',
        'voice': voice_used,
    }

144

145

146# ── Parent-side: ToolWorker instance ─────────────────────────────

147

# Parent-side handle for the CosyVoice worker. ``tool_module`` is the dotted
# path of this module, which the module docstring names as the worker entry
# point. Both timeouts are in the same unit (presumably seconds — confirm
# against ToolWorker).
_tool = ToolWorker(
    # Identity
    tool_name='cosyvoice3',
    engine='cosyvoice3',
    tool_module='integrations.service_tools.cosyvoice_tool',
    # Resources
    vram_budget='tts_cosyvoice3',
    output_subdir='cosyvoice/output',
    # Time limits
    startup_timeout=120.0,
    request_timeout=120.0,
)

157

158

def cosyvoice_synthesize(
    text: str,
    language: str = 'zh',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with CosyVoice 3 (9 languages, GPU subprocess).

    Thin parent-side wrapper: every argument is forwarded unchanged to
    ``_tool.synthesize``, together with ``SAMPLE_RATE`` as the
    ``default_sample_rate`` fallback.

    Args:
        text: Text to speak.
        language: Language code; defaults to ``'zh'``.
        voice: Optional path to a reference audio file.
        output_path: Optional destination path for the audio file.

    Returns:
        Whatever ``_tool.synthesize`` returns (described as JSON in the
        module docstring).
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
        'default_sample_rate': SAMPLE_RATE,
    }
    return _tool.synthesize(**request)

173

174

def unload_cosyvoice():
    """Stop the CosyVoice worker subprocess and free its VRAM.

    Delegates to ``_tool.stop()``; the result of that call is discarded.
    """
    worker = _tool
    worker.stop()

178

179

class CosyVoiceTool:
    """Register CosyVoice as an in-process service tool."""

    @classmethod
    def register_functions(cls) -> bool:
        """Create the ``cosyvoice`` registry entry and mark it healthy.

        Builds a ``ServiceToolInfo`` describing a single ``synthesize``
        endpoint, sets ``is_healthy`` to True, and stores it under the key
        ``"cosyvoice"`` in the service-tool registry.

        Returns:
            Always True.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        tool_info = ServiceToolInfo(
            name="cosyvoice",
            description=(
                "CosyVoice 3: multilingual zero-shot TTS. "
                "9 languages (zh/ja/ko/de/es/fr/it/ru/en), 3.5GB VRAM. "
                # CosyVoice 3 is loaded from a local dev clone, not from a
                # pip package, so the hint must not say "pip install".
                "Requires: local CosyVoice dev clone (not pip-installable)"
            ),
            base_url="inprocess://cosyvoice",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with CosyVoice 3 (9 languages, GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Reference audio path"},
                    },
                },
            },
            tags=["tts", "speech", "voice-cloning", "gpu", "cosyvoice", "multilingual"],
            # NOTE(review): the worker is configured with 120.0 for both
            # startup_timeout and request_timeout; confirm 60 is intended.
            timeout=60,
        )
        tool_info.is_healthy = True
        # NOTE(review): writes to the registry's private ``_tools`` mapping
        # directly; switch to a public registration method if one exists.
        service_tool_registry._tools["cosyvoice"] = tool_info
        return True

212

213# NOTE: no `if __name__ == '__main__':` block — the centralized

214# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.

Coverage for integrations / service_tools / cosyvoice_tool.py: 35.2%

71 statements