Coverage for integrations/service_tools/xtts

1"""

2XTTS-v2 tool — Coqui XTTS-v2, 17 languages with cross-lingual voice cloning.

4VRAM: ~2.5 GB on GPU; CPU inference is slow but supported.

5HF: coqui/XTTS-v2 (16-language production checkpoint, hi added in v2).

7Requires: pip install coqui-tts (the maintained 2026 fork — the

8original Coqui company shut down, the idiap fork ships fixes since

9late 2024 and is the recommended path on PyPI). Both packages

10expose `from TTS.api import TTS` so the import is unchanged.

12SUBPROCESS ISOLATED: same convention as f5_tts_tool — `_load` +

13`_synthesize` callbacks; the gpu_worker dispatcher imports this

14module in a worker subprocess.

16Public API (parent side):

17 xtts_synthesize(text, language, voice, output_path) → JSON

18 unload_xtts() → None

19"""

21from typing import Optional

23import os

24import sys

26from integrations.service_tools.gpu_worker import ToolWorker

# Fallback reference clip for XTTS voice cloning. This is the same
# location Nunba uses for chatterbox / F5, so reference audio that is
# already in place keeps working without extra configuration.
_DEFAULT_REF_VOICE = os.path.join(
    os.path.expanduser('~'),
    'Downloads',
    'Lily.mp3',
)

# Language codes accepted for XTTS-v2, mirroring the model card.
# Entries are bare two-letter codes; Chinese is stored here as 'zh' and
# is translated to the 'zh-cn' spelling XTTS expects by
# _resolve_xtts_lang(). This tuple is the canonical list for the
# router's language gate: it backs input validation in this module and
# the language list in tts_router.ENGINE_REGISTRY.
XTTS_LANGUAGES = (
    'en', 'es', 'fr', 'de', 'it', 'pt',
    'pl', 'tr', 'ru', 'nl', 'cs', 'ar',
    'zh', 'hu', 'ko', 'ja', 'hi',
)

def _resolve_xtts_lang(req_lang: Optional[str]) -> str:
    """Map an incoming language tag to the code XTTS expects.

    Region/script subtags are dropped (``pt_BR`` / ``pt-BR`` -> ``pt``)
    and the result is lower-cased. Surrounding whitespace is ignored so
    that a padded but otherwise valid tag such as ``' fr '`` is not
    silently downgraded to English. Chinese is the one special case:
    XTTS uses ``'zh-cn'`` rather than the bare ``'zh'``.

    Args:
        req_lang: Requested language tag, or ``None`` / empty.

    Returns:
        ``'zh-cn'`` for any Chinese tag, the bare primary subtag when it
        is listed in ``XTTS_LANGUAGES``, and ``'en'`` otherwise
        (including missing or blank input).
    """
    if not req_lang:
        return 'en'

    # Normalise the separator first so 'pt_BR' and 'pt-BR' both yield 'pt'.
    primary, _, _ = req_lang.strip().replace('_', '-').partition('-')
    code = primary.strip().lower()
    if not code:
        # Whitespace-only input carries no language information.
        return 'en'
    if code == 'zh':
        return 'zh-cn'
    return code if code in XTTS_LANGUAGES else 'en'

def _load():
    """Create the XTTS-v2 model on the best device that is available.

    Goes through Coqui's high-level API, i.e.
        TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=...)
    The very first call downloads ~1.5GB of weights into
    ~/.local/share/tts (or the HF cache); later loads are fast.

    Returns:
        The object produced by ``TTS(...)``; it is handed back to
        ``_synthesize`` as ``model``.
    """
    from TTS.api import TTS

    def _cuda_available() -> bool:
        # torch may be missing or broken in this environment; any failure
        # here simply means "run on CPU" (slow for XTTS, but it works).
        try:
            import torch
            return bool(torch.cuda.is_available())
        except Exception:
            return False

    # NOTE(review): recent coqui-tts releases appear to document `gpu=` as
    # deprecated in favour of `TTS(...).to(device)` — confirm against the
    # installed version before changing this call.
    return TTS(
        'tts_models/multilingual/multi-dataset/xtts_v2',
        gpu=_cuda_available(),
    )

def _synthesize(model, req: dict) -> dict:
    """Run one XTTS-v2 synthesis request and describe the written file.

    Args:
        model: Object returned by ``_load``; must expose ``tts_to_file``.
        req: Request mapping. Keys read: ``text`` (required),
            ``output_path`` (required), ``language`` (optional, defaults
            to ``'en'``) and ``voice`` (optional path to a reference clip).

    Returns:
        ``{'error': <message>}`` when validation fails. Otherwise a dict
        with ``path``, ``duration`` (seconds, 2 decimals; ``0.0`` if the
        file could not be probed), ``sample_rate``, ``engine``,
        ``device``, ``language`` (echoed as requested, not the resolved
        XTTS code) and ``voice`` (the reference clip actually used).
    """
    text = req.get('text', '')
    # A non-string payload is rejected like blank text instead of blowing
    # up with AttributeError on .strip().
    if not isinstance(text, str) or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    language = _resolve_xtts_lang(req.get('language', 'en'))

    # Resolve the reference voice for cloning. XTTS REQUIRES a speaker_wav
    # for inference — there is no built-in "default voice" mode — so fall
    # back to the shared Lily.mp3 clip when the caller names none.
    requested_voice = req.get('voice')
    ref_voice = requested_voice
    if not ref_voice and os.path.isfile(_DEFAULT_REF_VOICE):
        ref_voice = _DEFAULT_REF_VOICE

    if not ref_voice or not os.path.isfile(ref_voice):
        # Report the path that was really checked: the caller's own path
        # when one was supplied, otherwise the default fallback.
        checked = requested_voice or _DEFAULT_REF_VOICE
        return {
            'error': (
                'XTTS-v2 requires a speaker_wav reference; none found '
                f'(checked: {checked})'
            ),
        }

    model.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=ref_voice,
        language=language,
    )

    # Probe the duration via soundfile (lightweight). This is best-effort:
    # if soundfile is missing or cannot read the file, report 0.0 s and the
    # 24kHz rate of the multi-dataset checkpoint rather than failing a
    # request whose audio has already been written.
    try:
        import soundfile as _sf
        info = _sf.info(output_path)
        duration = round(info.frames / info.samplerate, 2)
        sr = info.samplerate
    except Exception:
        duration = 0.0
        sr = 24000

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'xtts-v2',
        # NOTE(review): relies on the model object exposing a `gpu`
        # attribute; if it does not, this always reports 'cpu' — confirm.
        'device': 'cuda' if getattr(model, 'gpu', False) else 'cpu',
        'language': req.get('language', 'en'),
        'voice': ref_voice,
    }

129

130

# ── Parent-side: one ToolWorker instance ─────────────────────────

# Module-level worker handle used by xtts_synthesize() and unload_xtts().
_tool = ToolWorker(
    engine='xtts-v2',
    tool_name='xtts_v2',
    tool_module='integrations.service_tools.xtts_tool',
    output_subdir='xtts/output',
    vram_budget='tts_xtts_v2',
    # Generous start-up window: the first load downloads ~1.5GB of weights.
    startup_timeout=180.0,
    request_timeout=120.0,
)

142

143

def xtts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize speech with XTTS-v2 (17 langs, voice cloning).

    All four arguments are forwarded unchanged, by keyword, to the
    module-level worker's ``synthesize`` method.

    Returns JSON. On subprocess crash the response contains
    `transient: true` so the caller can fall back.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _tool.synthesize(**request)

161

162

def unload_xtts():
    """Shut down the XTTS worker subprocess so its VRAM is released."""
    worker = _tool
    worker.stop()

166

167

class XTTSTool:
    """Registers XTTS-v2 as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the XTTS-v2 tool descriptor and store it in the registry.

        The descriptor is marked healthy and written into
        ``service_tool_registry._tools`` under the key ``"xtts_v2"``.

        Returns:
            ``True`` once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        tool_name = "xtts_v2"

        description = (
            "XTTS-v2: Coqui's multilingual voice-cloning TTS. "
            "17 languages, 6-second cloning from any reference clip, "
            "~2.5 GB VRAM, cross-lingual transfer. "
            "Requires: pip install coqui-tts"
        )

        # Schema for the single endpoint this tool exposes.
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with XTTS-v2 (17 langs, voice clone).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {
                    "type": "string",
                    "description": "Reference audio path (.wav/.mp3, ≥6s)",
                },
            },
        }

        tool_info = ServiceToolInfo(
            name=tool_name,
            description=description,
            base_url="inprocess://xtts_v2",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "voice-cloning", "multilingual", "xtts", "gpu"],
            timeout=120,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools[tool_name] = tool_info
        return True

202

203

204# NOTE: no `if __name__ == '__main__':` block — gpu_worker dispatcher

205# resolves `_load` / `_synthesize` by convention.

Coverage for integrations / service_tools / xtts_tool.py: 0.0%

52 statements