Coverage for integrations / service_tools / mms_tts_tool.py: 0.0%
96 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2MMS-TTS tool — Meta's Massively Multilingual Speech TTS (1100+ languages).
4VRAM: ~1.0 GB on GPU; runs comfortably on CPU too.
5Architecture: VITS (the same flow-based model VITS-MMS papers describe).
6HF: facebook/mms-tts-<iso639-3> (per-language checkpoint, ~150 MB each).
8Requires: only `transformers` (already bundled in Nunba's python-embed
9and HARTOS's main deps). No new pip dep on Linux/macOS.
11For non-Roman script languages (Arabic, Hindi, Mandarin, Korean, ...)
12the upstream VitsTokenizer flags `is_uroman=True` and expects pre-
13romanized input via the `uroman` perl package. This tool detects
14that flag and:
15 - if the optional `uroman` Python wrapper (or the `UROMAN` env var
16 pointing at the perl repo) is present → romanizes automatically;
17 - else returns `{'error': ..., 'transient': true}` so the router
18 falls through to the next engine in the language preference list.
20This keeps MMS as the universal-coverage fallback without breaking
21when uroman isn't installed — the language preference order picks up
22Indic Parler / XTTS / MeloTTS first when they're available.
24SUBPROCESS ISOLATED: same convention as f5_tts_tool / chatterbox_tool.
26Public API (parent side):
27 mms_tts_synthesize(text, language, voice, output_path) → JSON
28 unload_mms_tts() → None
29"""
31from typing import Optional
33import os
34import sys
36from integrations.service_tools.gpu_worker import ToolWorker
# ── ISO 639-1 → ISO 639-3 lookup for MMS-TTS repos ───────────────
#
# MMS-TTS checkpoints are named with 3-letter ISO 639-3 codes
# (eng / fra / hin / cmn / ...), whereas Nunba and HARTOS pass 2-letter
# ISO 639-1 codes (en / fr / hi / zh). This table is the SINGLE bridge
# between the two: each language from core.constants.SUPPORTED_LANG_DICT
# with a known mms-tts-<iso3> repo appears here. A code is left out on
# purpose when either
#   (a) no HuggingFace mms-tts checkpoint exists for it, or
#   (b) its ISO3 differs from the obvious 1↔3 collation and still needs
#       verification before real users are routed through it.
#
# Source: facebook/mms-tts model collection on HuggingFace.
ISO1_TO_ISO3 = {
    # Major European
    'en': 'eng',
    'es': 'spa',
    'fr': 'fra',
    'de': 'deu',
    'it': 'ita',
    'pt': 'por',
    'nl': 'nld',
    'pl': 'pol',
    'tr': 'tur',
    'ru': 'rus',
    'cs': 'ces',
    'hu': 'hun',
    'sv': 'swe',
    'fi': 'fin',
    'el': 'ell',
    'ro': 'ron',
    'bg': 'bul',
    'uk': 'ukr',
    'cy': 'cym',
    'is': 'isl',
    # CJK + SEA
    'zh': 'cmn',
    'ja': 'jpn',
    'ko': 'kor',
    'vi': 'vie',
    'th': 'tha',
    'id': 'ind',
    'ms': 'zlm',
    'km': 'khm',
    'lo': 'lao',
    'my': 'mya',
    # Indic (subset that has explicit mms-tts checkpoints)
    'hi': 'hin',
    'bn': 'ben',
    'ta': 'tam',
    'te': 'tel',
    'mr': 'mar',
    'gu': 'guj',
    'kn': 'kan',
    'ml': 'mal',
    'pa': 'pan',
    'or': 'ory',
    'ne': 'nep',
    'as': 'asm',
    'sd': 'snd',
    'sa': 'san',
    'ur': 'urd',
    'si': 'sin',
    # Middle East / Africa
    'ar': 'ara',
    'fa': 'pes',
    'he': 'heb',
    'sw': 'swh',
}
def _iso1_to_iso3(req_lang: Optional[str]) -> Optional[str]:
    """Translate a requested language tag into its MMS-TTS ISO 639-3 code.

    Only the primary subtag is looked at: underscores are treated like
    hyphens, everything after the first separator is dropped, and the
    remainder is lower-cased before the `ISO1_TO_ISO3` lookup. A missing
    or empty tag resolves to the English entry.

    Returns None when the mapping has no verified MMS checkpoint for the
    language; callers should read that as "this engine cannot serve this
    language" and fall through to the next preference.
    """
    if req_lang:
        primary, _, _ = req_lang.replace('_', '-').partition('-')
        return ISO1_TO_ISO3.get(primary.lower())
    return ISO1_TO_ISO3.get('en')
def _try_uromanize(text: str) -> Optional[str]:
    """Best-effort romanization for non-Roman script input.

    Returns the romanized string on success, or None when uroman is not
    usable in any supported form. The caller treats None as a hard
    failure for the current request.

    Order of attempts:
      1. The `uroman` Python wrapper (`pip install uroman`). The wrapper
         instance is built once and reused on later calls instead of
         being reconstructed for every request.
      2. The `UROMAN` env var pointing at the isi-nlp/uroman perl repo
         (the canonical upstream path documented by HF); the script is
         expected at `<UROMAN>/bin/uroman.pl`.
    """
    # ── 1. Python wrapper ────────────────────────────────────────
    # The instance is cached on the function object so repeated requests
    # share one romanizer. It is only cached after a successful build, so
    # a missing package is re-probed on the next call.
    try:
        romanizer = getattr(_try_uromanize, '_uroman_instance', None)
        if romanizer is None:
            import uroman as _uroman_pkg  # type: ignore
            romanizer = _uroman_pkg.Uroman()
            _try_uromanize._uroman_instance = romanizer
        return romanizer.romanize_string(text)
    except Exception:
        # Deliberate best-effort: any failure in the optional third-party
        # wrapper just means "try the perl route".
        pass

    # ── 2. Perl repo via the UROMAN env var ──────────────────────
    uroman_root = os.environ.get('UROMAN')
    if not uroman_root or not os.path.isdir(uroman_root):
        return None
    script = os.path.join(uroman_root, 'bin', 'uroman.pl')
    if not os.path.isfile(script):
        return None

    import subprocess
    try:
        proc = subprocess.run(
            ['perl', script],
            input=text.encode('utf-8'),
            capture_output=True,
            timeout=15,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # perl missing / not executable, un-encodable text, or timeout.
        return None
    if proc.returncode != 0:
        return None
    # Strip the trailing line terminator (LF or CRLF) perl appends.
    return proc.stdout.decode('utf-8', errors='replace').rstrip('\r\n')
def _load():
    """Build the worker state with the English MMS-TTS checkpoint loaded.

    Picks CUDA when torch reports it available, otherwise CPU, loads the
    `facebook/mms-tts-eng` tokenizer and model, and returns a state
    object exposing:

      - `device`: 'cuda' or 'cpu', the device the English model ended up on;
      - `cache`:  dict mapping ISO 639-3 code → (tokenizer, model),
                  pre-seeded with the 'eng' entry.

    Model+tokenizer pairs are per-language; every language other than
    English is added to `cache` lazily by `_synthesize` on first request.
    """
    from transformers import VitsTokenizer, VitsModel

    device = 'cpu'
    try:
        import torch
        if torch.cuda.is_available():
            device = 'cuda'
    except Exception:
        # torch missing or CUDA probing broken → stay on CPU.
        device = 'cpu'

    repo = 'facebook/mms-tts-eng'
    tokenizer = VitsTokenizer.from_pretrained(repo)
    model = VitsModel.from_pretrained(repo)
    if device == 'cuda':
        try:
            model = model.to('cuda')
        except Exception:
            # A failed move (e.g. out of memory) can leave some weights on
            # the GPU and the rest on the CPU; pull everything back so the
            # model is consistently on the device we report.
            device = 'cpu'
            model = model.to('cpu')

    class _State:
        def __init__(self_, device_name, voices):
            self_.device = device_name
            # iso3 → (tokenizer, model)
            self_.cache = voices

    return _State(device, {'eng': (tokenizer, model)})
def _model_device(model, fallback: str) -> str:
    """Return the device type ('cuda' / 'cpu') that holds `model`'s weights.

    `fallback` is returned when the model exposes no parameters to inspect.
    """
    try:
        return next(model.parameters()).device.type
    except Exception:
        return fallback


def _synthesize(state, req: dict) -> dict:
    """Synthesize `req['text']` to a WAV file with the MMS-TTS checkpoint.

    Args:
        state: object returned by `_load()`; `state.device` is the
            preferred device and `state.cache` maps ISO 639-3 codes to
            (tokenizer, model) pairs. New languages are added to the
            cache on first use.
        req: request dict. Reads `text` (required), `output_path`
            (required) and `language` (defaults to 'en').

    Returns:
        On success a dict with `path`, `duration` (seconds, 2 decimals),
        `sample_rate`, `engine`, `device`, `language`, `iso3`, `voice`.
        On failure a dict with `error`; recoverable failures (unmapped
        language, checkpoint load failure, uroman unavailable) also carry
        `transient: True` so the caller can fall back to another engine.

    Exceptions raised by tokenization, inference or the WAV write are not
    caught here and propagate to the caller.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    iso3 = _iso1_to_iso3(req.get('language', 'en'))
    if not iso3:
        return {
            'error': (
                f"MMS-TTS has no mapped checkpoint for language "
                f"'{req.get('language')}'"
            ),
            'transient': True,
        }

    # Lazy-load the per-language model
    if iso3 not in state.cache:
        from transformers import VitsTokenizer, VitsModel
        repo = f'facebook/mms-tts-{iso3}'
        try:
            tokenizer = VitsTokenizer.from_pretrained(repo)
            model = VitsModel.from_pretrained(repo)
            if state.device == 'cuda':
                try:
                    model = model.to('cuda')
                except Exception:
                    # GPU move failed (e.g. VRAM exhausted by previously
                    # cached languages). A partial move can leave weights
                    # split across devices, so pull the model fully back
                    # to CPU and serve this language from there.
                    model = model.to('cpu')
            state.cache[iso3] = (tokenizer, model)
        except Exception as e:
            return {
                'error': f'mms-tts-{iso3} load failed: {e}',
                'transient': True,
            }

    tokenizer, model = state.cache[iso3]

    # Romanize input text on demand for non-Roman script languages.
    # The VitsTokenizer.is_uroman flag tells us whether the model was
    # trained on romanized text. If True and the input contains
    # non-ASCII, route through uroman first.
    if getattr(tokenizer, 'is_uroman', False) and not text.isascii():
        roman = _try_uromanize(text)
        if roman is None:
            return {
                'error': (
                    f"mms-tts-{iso3} requires uroman for non-Roman "
                    f"input; install `pip install uroman` or set "
                    f"UROMAN env var to the isi-nlp/uroman repo path"
                ),
                'transient': True,
            }
        text = roman

    # Follow the model, not state.device: a language whose GPU move failed
    # lives on CPU even when the worker as a whole prefers CUDA, and
    # feeding it CUDA tensors would crash with a device mismatch.
    device = _model_device(model, state.device)

    inputs = tokenizer(text=text, return_tensors='pt')
    if device != 'cpu':
        inputs = {k: v.to(device) for k, v in inputs.items()}

    import torch
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].detach().cpu().numpy()
    sr = int(model.config.sampling_rate)

    # Write WAV via soundfile (already a transitive of the bigger TTS
    # engines so it's reliably present).
    import soundfile as _sf
    _sf.write(output_path, waveform, sr)

    duration = round(len(waveform) / sr, 2)

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'mms-tts',
        # Device the inference actually ran on for this language.
        'device': device,
        'language': req.get('language', 'en'),
        'iso3': iso3,
        'voice': 'default',
    }
# ── Parent-side: the single ToolWorker handle for this engine ────
#
# NOTE(review): only the English checkpoint is loaded by `_load`; other
# languages are fetched lazily inside `_synthesize`, so a cold
# per-language download presumably runs under `request_timeout` rather
# than `startup_timeout` — confirm 90 s is enough on slow links.
_tool = ToolWorker(
    engine='mms-tts',
    tool_name='mms_tts',
    tool_module='integrations.service_tools.mms_tts_tool',
    output_subdir='mms_tts/output',
    vram_budget='tts_mms_tts',
    request_timeout=90.0,
    startup_timeout=120.0,  # first-time per-language download is ~150 MB
)
def mms_tts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Run one MMS-TTS (Meta's 1100+ language VITS) synthesis request.

    All four arguments are forwarded unchanged, by keyword, to the
    module-level worker's `synthesize` method, and its result is returned
    as-is.

    Returns JSON. On subprocess crash or unsupported language the
    response contains `transient: true` so the caller can fall back.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _tool.synthesize(**request)
def unload_mms_tts():
    """Shut down the MMS-TTS worker subprocess, releasing its VRAM."""
    worker = _tool
    worker.stop()
class MMSTTSTool:
    """Registration shim exposing MMS-TTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Publish the `mms_tts` entry in the service tool registry.

        Builds a `ServiceToolInfo` describing the single `synthesize`
        endpoint, marks it healthy, stores it under the "mms_tts" key of
        the registry's tool table, and returns True.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        summary = (
            "MMS-TTS: Meta's Massively Multilingual Speech TTS. "
            "1100+ languages via per-language VITS checkpoints, "
            "~1 GB VRAM, no voice cloning. "
            "Non-Roman scripts need uroman (perl or "
            "`pip install uroman`). Uses transformers — no extra pip dep."
        )
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with MMS-TTS (1100+ languages, GPU/CPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
            },
        }

        tool_info = ServiceToolInfo(
            name="mms_tts",
            description=summary,
            base_url="inprocess://mms_tts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "multilingual", "mms", "vits"],
            timeout=60,
        )
        tool_info.is_healthy = True
        # Written straight into the registry's tool table.
        service_tool_registry._tools["mms_tts"] = tool_info
        return True
323# NOTE: no `if __name__ == '__main__':` block — gpu_worker dispatcher
324# resolves `_load` / `_synthesize` by convention.