Coverage for integrations / service_tools / xtts_tool.py: 0.0%

52 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2XTTS-v2 tool — Coqui XTTS-v2, 17 languages with cross-lingual voice cloning. 

3 

4VRAM: ~2.5 GB on GPU; CPU inference is slow but supported. 

5HF: coqui/XTTS-v2 (16-language production checkpoint, hi added in v2). 

6 

7Requires: pip install coqui-tts (the maintained 2026 fork — the 

8original Coqui company shut down, the idiap fork ships fixes since 

9late 2024 and is the recommended path on PyPI). Both packages 

10expose `from TTS.api import TTS` so the import is unchanged. 

11 

12SUBPROCESS ISOLATED: same convention as f5_tts_tool — `_load` + 

13`_synthesize` callbacks; the gpu_worker dispatcher imports this 

14module in a worker subprocess. 

15 

16Public API (parent side): 

17 xtts_synthesize(text, language, voice, output_path) → JSON 

18 unload_xtts() → None 

19""" 

20 

21from typing import Optional 

22 

23import os 

24import sys 

25 

26from integrations.service_tools.gpu_worker import ToolWorker 

27 

# Fallback speaker reference for XTTS voice cloning. This is the same
# file Nunba's chatterbox / F5 tools look for, so reference audio that
# already exists for those engines is picked up here unchanged.
_DEFAULT_REF_VOICE = os.path.join(
    os.path.expanduser('~'),
    'Downloads',
    'Lily.mp3',
)

31 

# Language codes accepted for XTTS-v2, as bare two-letter ISO 639-1
# codes mirroring the model card. Chinese is stored here as 'zh';
# `_resolve_xtts_lang` translates it to the 'zh-cn' tag before the
# model is called. This tuple is the canonical list for both input
# validation in this module and the language list in
# tts_router.ENGINE_REGISTRY.
XTTS_LANGUAGES = (
    'en', 'es', 'fr', 'de', 'it', 'pt',
    'pl', 'tr', 'ru', 'nl', 'cs', 'ar',
    'zh', 'hu', 'ko', 'ja', 'hi',
)

40 

41 

def _resolve_xtts_lang(req_lang: Optional[str]) -> str:
    """Map an incoming language tag to the code XTTS-v2 expects.

    Only the primary subtag is considered: region/script suffixes are
    dropped ('pt_BR' and 'pt-BR' both become 'pt'), surrounding
    whitespace is ignored and the result is lower-cased. Chinese is the
    one special case — XTTS uses 'zh-cn' rather than 'zh'.

    Args:
        req_lang: Requested language tag, e.g. 'en', 'zh-CN', 'pt_BR'.
            May be None or empty.

    Returns:
        'zh-cn' for any Chinese tag, the bare two-letter code when it is
        listed in XTTS_LANGUAGES, and 'en' for anything missing,
        non-string, blank or unsupported.
    """
    # Requests arrive as loosely-typed dicts; treat anything that is not
    # a string the same as "no language given" instead of raising.
    if not isinstance(req_lang, str):
        return 'en'

    # Strip after splitting so padding around the primary subtag
    # (' fr', 'en -US') does not turn a supported code into a miss.
    code = req_lang.replace('_', '-').split('-')[0].strip().lower()
    if not code:
        return 'en'
    if code == 'zh':
        return 'zh-cn'
    return code if code in XTTS_LANGUAGES else 'en'

54 

55 

def _load():
    """Build the XTTS-v2 model on the best device available.

    Instantiates Coqui's API object:
        TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=...)
    The first call downloads ~1.5GB of weights into ~/.local/share/tts
    (or the HF cache); later loads reuse them and are fast.

    Returns:
        The constructed `TTS` instance.
    """
    from TTS.api import TTS

    # Default to CPU (slow but functional) and switch to CUDA only when
    # torch imports cleanly and reports a usable device.
    use_gpu = False
    try:
        import torch
        use_gpu = bool(torch.cuda.is_available())
    except Exception:
        # Any failure while probing torch means we stay on CPU.
        use_gpu = False

    return TTS('tts_models/multilingual/multi-dataset/xtts_v2', gpu=use_gpu)

74 

75 

def _synthesize(model, req: dict) -> dict:
    """Synthesize one request with a loaded XTTS-v2 model.

    Args:
        model: Object returned by `_load`; must provide `tts_to_file`.
        req: Request dict. Keys read here:
            'text'        — text to speak (required, non-blank string)
            'output_path' — destination audio file (required)
            'language'    — language tag, defaults to 'en'
            'voice'       — path to a reference clip for cloning;
                            falls back to `_DEFAULT_REF_VOICE` when
                            absent or empty

    Returns:
        On success a dict with 'path', 'duration', 'sample_rate',
        'engine', 'device', 'language' and 'voice'. On a validation
        failure a dict with a single 'error' key.
    """
    text = req.get('text', '')
    # Non-string payloads are rejected the same way as blank text rather
    # than crashing on `.strip()`.
    if not isinstance(text, str) or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # Resolve reference voice for cloning. XTTS REQUIRES a speaker_wav
    # for inference — there is no built-in "default voice" mode, so a
    # request without a voice uses the shared Lily.mp3 fallback. An
    # explicitly requested voice is never silently replaced.
    ref_voice = req.get('voice') or _DEFAULT_REF_VOICE
    if not os.path.isfile(ref_voice):
        # Report the path that was actually tested, so a bad `voice`
        # argument is not blamed on the default file.
        return {
            'error': (
                'XTTS-v2 requires a speaker_wav reference; none found '
                f'(checked: {ref_voice})'
            ),
        }

    language = _resolve_xtts_lang(req.get('language', 'en'))

    model.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=ref_voice,
        language=language,
    )

    # Probe duration via soundfile (lightweight). This is best-effort:
    # if soundfile is missing or cannot read the file we still return a
    # result, with duration 0.0 and the 24kHz rate of the multi-dataset
    # XTTS checkpoint.
    try:
        import soundfile as _sf
        info = _sf.info(output_path)
        duration = round(info.frames / info.samplerate, 2)
        sr = info.samplerate
    except Exception:
        duration = 0.0
        sr = 24000

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'xtts-v2',
        # NOTE(review): relies on the model object exposing a truthy
        # `gpu` attribute; if it does not, this always reports 'cpu' —
        # confirm against the installed TTS package.
        'device': 'cuda' if getattr(model, 'gpu', False) else 'cpu',
        # Echoes the tag the caller sent, not the resolved XTTS code.
        'language': req.get('language', 'en'),
        'voice': ref_voice,
    }

129 

130 

# ── Parent-side: one ToolWorker instance ─────────────────────────
# Single module-level worker handle; `xtts_synthesize` and
# `unload_xtts` below both go through it.

_tool = ToolWorker(
    tool_name='xtts_v2',
    tool_module='integrations.service_tools.xtts_tool',
    vram_budget='tts_xtts_v2',
    output_subdir='xtts/output',
    engine='xtts-v2',
    # Generous startup window: the first load downloads ~1.5GB of weights.
    startup_timeout=180.0,
    request_timeout=120.0,
)

142 

143 

def xtts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Run XTTS-v2 synthesis (17 langs, voice cloning) via the worker.

    All arguments are forwarded unchanged, by keyword, to the
    module-level `_tool.synthesize`.

    Returns JSON. On subprocess crash the response contains
    `transient: true` so the caller can fall back.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _tool.synthesize(**request)

161 

162 

def unload_xtts():
    """Shut down the XTTS worker subprocess so its VRAM is released.

    Delegates to `_tool.stop()` and returns None.
    """
    _tool.stop()
    return None

166 

167 

class XTTSTool:
    """Registers XTTS-v2 as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Create the XTTS-v2 `ServiceToolInfo` and store it in the registry.

        The entry is marked healthy and written directly into
        `service_tool_registry._tools` under the key "xtts_v2".

        Returns:
            True once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        # Parameter schema for the single "synthesize" endpoint.
        synthesize_params = {
            "text": {"type": "string"},
            "language": {"type": "string"},
            "voice": {
                "type": "string",
                "description": "Reference audio path (.wav/.mp3, ≥6s)",
            },
        }
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with XTTS-v2 (17 langs, voice clone).",
            "params_schema": synthesize_params,
        }
        summary = (
            "XTTS-v2: Coqui's multilingual voice-cloning TTS. "
            "17 languages, 6-second cloning from any reference clip, "
            "~2.5 GB VRAM, cross-lingual transfer. "
            "Requires: pip install coqui-tts"
        )

        tool_info = ServiceToolInfo(
            name="xtts_v2",
            description=summary,
            base_url="inprocess://xtts_v2",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "voice-cloning", "multilingual", "xtts", "gpu"],
            timeout=120,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["xtts_v2"] = tool_info
        return True

202 

203 

204# NOTE: no `if __name__ == '__main__':` block — gpu_worker dispatcher 

205# resolves `_load` / `_synthesize` by convention.