Coverage for integrations / service_tools / xtts_tool.py: 0.0%
52 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2XTTS-v2 tool — Coqui XTTS-v2, 17 languages with cross-lingual voice cloning.
4VRAM: ~2.5 GB on GPU; CPU inference is slow but supported.
5HF: coqui/XTTS-v2 (16-language production checkpoint, hi added in v2).
7Requires: pip install coqui-tts (the maintained 2026 fork — the
8original Coqui company shut down, the idiap fork ships fixes since
9late 2024 and is the recommended path on PyPI). Both packages
10expose `from TTS.api import TTS` so the import is unchanged.
12SUBPROCESS ISOLATED: same convention as f5_tts_tool — `_load` +
13`_synthesize` callbacks; the gpu_worker dispatcher imports this
14module in a worker subprocess.
16Public API (parent side):
17 xtts_synthesize(text, language, voice, output_path) → JSON
18 unload_xtts() → None
19"""
21from typing import Optional
23import os
24import sys
26from integrations.service_tools.gpu_worker import ToolWorker
# Default reference voice for XTTS voice cloning — same path Nunba uses
# for chatterbox / F5 so existing reference audio Just Works.
# Resolves to <home>/Downloads/Lily.mp3; _synthesize falls back to this
# file when a request carries no 'voice'.
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')
# XTTS-v2 supported language codes, stored as bare ISO 639-1 tags.
# Chinese is listed here as 'zh'; _resolve_xtts_lang translates it to the
# 'zh-cn' code that XTTS itself expects.
# Mirror of the model card; canonical for the router's language gate.
# Used both for input validation here and for the language list in
# tts_router.ENGINE_REGISTRY.
XTTS_LANGUAGES = (
    'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
    'cs', 'ar', 'zh', 'hu', 'ko', 'ja', 'hi',
)
42def _resolve_xtts_lang(req_lang: Optional[str]) -> str:
43 """Map an incoming ISO code to the dialect XTTS expects.
45 Specifically: XTTS uses 'zh-cn' (not 'zh') for Chinese. Everything
46 else is the bare 2-letter code.
47 """
48 if not req_lang:
49 return 'en'
50 code = req_lang.replace('_', '-').split('-')[0].lower()
51 if code == 'zh':
52 return 'zh-cn'
53 return code if code in XTTS_LANGUAGES else 'en'
def _load():
    """Build the XTTS-v2 model object, on CUDA when it is available.

    Instantiates Coqui's ``TTS`` API for the
    ``tts_models/multilingual/multi-dataset/xtts_v2`` checkpoint and
    passes ``gpu=True`` only when torch reports a usable CUDA device.
    Per the original note, the first call downloads ~1.5GB of weights
    into ~/.local/share/tts (or the HF cache); later loads are fast.

    Returns:
        The constructed ``TTS`` instance.
    """
    from TTS.api import TTS

    def _cuda_available() -> bool:
        # Any failure (torch missing, broken CUDA runtime, ...) means we
        # run on CPU — slow for XTTS, but it still works.
        try:
            import torch
            return bool(torch.cuda.is_available())
        except Exception:
            return False

    use_gpu = _cuda_available()
    return TTS('tts_models/multilingual/multi-dataset/xtts_v2', gpu=use_gpu)
76def _synthesize(model, req: dict) -> dict:
77 text = req.get('text', '')
78 if not text or not text.strip():
79 return {'error': 'Text is required'}
81 output_path = req.get('output_path')
82 if not output_path:
83 return {'error': 'output_path is required'}
85 language = _resolve_xtts_lang(req.get('language', 'en'))
87 # Resolve reference voice for cloning. XTTS REQUIRES a speaker_wav
88 # for inference — there's no built-in "default voice" mode, so we
89 # use the same Lily.mp3 fallback the rest of Nunba uses.
90 ref_voice = req.get('voice')
91 if not ref_voice and os.path.isfile(_DEFAULT_REF_VOICE):
92 ref_voice = _DEFAULT_REF_VOICE
94 if not ref_voice or not os.path.isfile(ref_voice):
95 return {
96 'error': (
97 'XTTS-v2 requires a speaker_wav reference; none found '
98 f'(checked: {_DEFAULT_REF_VOICE})'
99 ),
100 }
102 model.tts_to_file(
103 text=text,
104 file_path=output_path,
105 speaker_wav=ref_voice,
106 language=language,
107 )
109 # Probe duration via soundfile (lightweight). XTTS sample rate
110 # is 24kHz on the multi-dataset checkpoint.
111 try:
112 import soundfile as _sf
113 info = _sf.info(output_path)
114 duration = round(info.frames / info.samplerate, 2)
115 sr = info.samplerate
116 except Exception:
117 duration = 0.0
118 sr = 24000
120 return {
121 'path': output_path,
122 'duration': duration,
123 'sample_rate': sr,
124 'engine': 'xtts-v2',
125 'device': 'cuda' if getattr(model, 'gpu', False) else 'cpu',
126 'language': req.get('language', 'en'),
127 'voice': ref_voice,
128 }
# ── Parent-side worker handle ────────────────────────────────────
# Single module-level ToolWorker; xtts_synthesize() and unload_xtts()
# both go through it.
_tool = ToolWorker(
    engine='xtts-v2',
    tool_name='xtts_v2',
    tool_module='integrations.service_tools.xtts_tool',
    output_subdir='xtts/output',
    vram_budget='tts_xtts_v2',
    # Generous startup window: the first load downloads ~1.5GB of weights.
    startup_timeout=180.0,
    request_timeout=120.0,
)
def xtts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Run an XTTS-v2 synthesis request through the shared worker.

    All four arguments are forwarded unchanged, by keyword, to the
    module-level ToolWorker's ``synthesize`` method.

    Args:
        text: Text to speak.
        language: Requested language tag (defaults to ``'en'``).
        voice: Optional reference audio path used for voice cloning.
        output_path: Optional destination path for the generated audio.

    Returns:
        Whatever the worker's ``synthesize`` returns — documented as a
        JSON string that contains ``transient: true`` on subprocess
        crash so the caller can fall back.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _tool.synthesize(**request)
def unload_xtts() -> None:
    """Stop the XTTS worker subprocess and free VRAM.

    Delegates to the module-level ToolWorker's ``stop`` method; the
    result of that call is discarded.
    """
    _tool.stop()
class XTTSTool:
    """Registers XTTS-v2 with the in-process service tool registry."""

    @classmethod
    def register_functions(cls):
        """Publish the ``xtts_v2`` tool entry and mark it healthy.

        Builds a ServiceToolInfo describing the single ``synthesize``
        endpoint, flags it healthy, and stores it in the registry's tool
        table under the key ``"xtts_v2"``.

        Returns:
            True, unconditionally, once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        tool_key = "xtts_v2"

        # Parameters accepted by the synthesize endpoint.
        params_schema = {
            "text": {"type": "string"},
            "language": {"type": "string"},
            "voice": {
                "type": "string",
                "description": "Reference audio path (.wav/.mp3, ≥6s)",
            },
        }
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with XTTS-v2 (17 langs, voice clone).",
            "params_schema": params_schema,
        }
        summary = (
            "XTTS-v2: Coqui's multilingual voice-cloning TTS. "
            "17 languages, 6-second cloning from any reference clip, "
            "~2.5 GB VRAM, cross-lingual transfer. "
            "Requires: pip install coqui-tts"
        )

        entry = ServiceToolInfo(
            name=tool_key,
            description=summary,
            base_url="inprocess://xtts_v2",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "voice-cloning", "multilingual", "xtts", "gpu"],
            timeout=120,
        )
        entry.is_healthy = True
        service_tool_registry._tools[tool_key] = entry
        return True
204# NOTE: no `if __name__ == '__main__':` block — gpu_worker dispatcher
205# resolves `_load` / `_synthesize` by convention.