Coverage for integrations/service_tools/mms_tts

1"""

2MMS-TTS tool — Meta's Massively Multilingual Speech TTS (1100+ languages).

4VRAM: ~1.0 GB on GPU; runs comfortably on CPU too.

5Architecture: VITS (the same flow-based model VITS-MMS papers describe).

6HF: facebook/mms-tts-<iso639-3> (per-language checkpoint, ~150 MB each).

8Requires: only `transformers` (already bundled in Nunba's python-embed

9and HARTOS's main deps). No new pip dep on Linux/macOS.

11For non-Roman script languages (Arabic, Hindi, Mandarin, Korean, ...)

12the upstream VitsTokenizer flags `is_uroman=True` and expects pre-

13romanized input via the `uroman` perl package. This tool detects

14that flag and:

15 - if the optional `uroman` Python wrapper (or the `UROMAN` env var

16 pointing at the perl repo) is present → romanizes automatically;

17 - else returns `{'error': ..., 'transient': true}` so the router

18 falls through to the next engine in the language preference list.

20This keeps MMS as the universal-coverage fallback without breaking

21when uroman isn't installed — the language preference order picks up

22Indic Parler / XTTS / MeloTTS first when they're available.

24SUBPROCESS ISOLATED: same convention as f5_tts_tool / chatterbox_tool.

26Public API (parent side):

27 mms_tts_synthesize(text, language, voice, output_path) → JSON

28 unload_mms_tts() → None

29"""

31from typing import Optional

33import os

34import sys

36from integrations.service_tools.gpu_worker import ToolWorker

38# ── ISO 639-1 → ISO 639-3 mapping for MMS-TTS repos ──────────────

39#

40# MMS-TTS uses 3-letter ISO 639-3 codes (eng / fra / hin / cmn / ...).

41# Nunba and HARTOS speak ISO 639-1 (en / fr / hi / zh). This map is

42# the SINGLE bridge between the two — every language in

43# core.constants.SUPPORTED_LANG_DICT that has a known mms-tts-<iso3>

44# repo is listed below. Codes deliberately NOT mapped here either

45# (a) don't have a HuggingFace mms-tts checkpoint, or (b) use a

46# different ISO3 than the obvious 1↔3 collation and need verification

47# before we route real users through them.

48#

49# Source: facebook/mms-tts model collection on HuggingFace.

# Lookup table from two-letter ISO 639-1 codes to the three-letter
# ISO 639-3 codes used in `facebook/mms-tts-<iso3>` repository names.
# One entry per line; insertion order matches the regional grouping.
ISO1_TO_ISO3 = {
    # Major European
    'en': 'eng',  # English
    'es': 'spa',  # Spanish
    'fr': 'fra',  # French
    'de': 'deu',  # German
    'it': 'ita',  # Italian
    'pt': 'por',  # Portuguese
    'nl': 'nld',  # Dutch
    'pl': 'pol',  # Polish
    'tr': 'tur',  # Turkish
    'ru': 'rus',  # Russian
    'cs': 'ces',  # Czech
    'hu': 'hun',  # Hungarian
    'sv': 'swe',  # Swedish
    'fi': 'fin',  # Finnish
    'el': 'ell',  # Greek
    'ro': 'ron',  # Romanian
    'bg': 'bul',  # Bulgarian
    'uk': 'ukr',  # Ukrainian
    'cy': 'cym',  # Welsh
    'is': 'isl',  # Icelandic
    # CJK + SEA
    'zh': 'cmn',  # Mandarin Chinese
    'ja': 'jpn',  # Japanese
    'ko': 'kor',  # Korean
    'vi': 'vie',  # Vietnamese
    'th': 'tha',  # Thai
    'id': 'ind',  # Indonesian
    'ms': 'zlm',  # Malay
    'km': 'khm',  # Khmer
    'lo': 'lao',  # Lao
    'my': 'mya',  # Burmese
    # Indic (subset that has explicit mms-tts checkpoints)
    'hi': 'hin',  # Hindi
    'bn': 'ben',  # Bengali
    'ta': 'tam',  # Tamil
    'te': 'tel',  # Telugu
    'mr': 'mar',  # Marathi
    'gu': 'guj',  # Gujarati
    'kn': 'kan',  # Kannada
    'ml': 'mal',  # Malayalam
    'pa': 'pan',  # Punjabi
    'or': 'ory',  # Odia
    'ne': 'nep',  # Nepali
    'as': 'asm',  # Assamese
    'sd': 'snd',  # Sindhi
    'sa': 'san',  # Sanskrit
    'ur': 'urd',  # Urdu
    'si': 'sin',  # Sinhala
    # Middle East / Africa
    'ar': 'ara',  # Arabic
    'fa': 'pes',  # Persian
    'he': 'heb',  # Hebrew
    'sw': 'swh',  # Swahili
}

def _iso1_to_iso3(req_lang: Optional[str]) -> Optional[str]:
    """Translate a caller-supplied language tag to an MMS-TTS ISO 639-3 code.

    Only the primary subtag is used, so region-qualified tags such as
    ``hi-IN`` or ``pt_BR`` resolve through their two-letter prefix.
    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        req_lang: Language tag, or None / empty string to request the
            default (English).

    Returns:
        The ISO 639-3 code from ``ISO1_TO_ISO3``, or None when the
        language has no entry in that table. Callers should treat None
        as "this engine cannot serve this language" and fall through to
        the next preference.
    """
    if not req_lang:
        return ISO1_TO_ISO3.get('en')
    # Strip first: a padded tag (" en", "hi-IN\n") would otherwise miss
    # the table and make the engine look unavailable for that language.
    primary = req_lang.strip().replace('_', '-').split('-')[0].lower()
    return ISO1_TO_ISO3.get(primary)

# Holds the lazily created `uroman.Uroman` instance so it is constructed
# once per process instead of on every romanization request.
_UROMAN_CACHE = {}


def _try_uromanize(text: str) -> Optional[str]:
    """Best-effort romanization for non-Roman script input.

    Backends are tried in this order:
      1. The `uroman` Python wrapper (`pip install uroman`).
      2. The perl script at `$UROMAN/bin/uroman.pl`, where the `UROMAN`
         env var points at a checkout of the isi-nlp/uroman repo.

    Args:
        text: The text to romanize.

    Returns:
        The romanized string, or None when neither backend is available
        or both fail. This function does not raise for backend failures;
        `_synthesize` turns a None result into a transient error
        response.
    """
    # Pure-Python wrapper first.
    try:
        romanizer = _UROMAN_CACHE.get('instance')
        if romanizer is None:
            import uroman as _uroman_pkg  # type: ignore
            romanizer = _uroman_pkg.Uroman()
            _UROMAN_CACHE['instance'] = romanizer
        return romanizer.romanize_string(text)
    except Exception:
        # Deliberate best-effort: a missing or broken wrapper just means
        # we fall through to the perl backend below.
        pass

    # Perl repo via UROMAN env var.
    uroman_root = os.environ.get('UROMAN')
    if not uroman_root or not os.path.isdir(uroman_root):
        return None
    script = os.path.join(uroman_root, 'bin', 'uroman.pl')
    if not os.path.isfile(script):
        return None

    try:
        import subprocess
        proc = subprocess.run(
            ['perl', script],
            input=text.encode('utf-8'),
            capture_output=True,
            timeout=15,
        )
    except Exception:
        # perl missing, timeout, or un-encodable text: report "unavailable".
        return None

    if proc.returncode != 0:
        return None
    # Strip both CR and LF so CRLF line endings do not leave a stray
    # '\r' at the end of the romanized text.
    return proc.stdout.decode('utf-8', errors='replace').rstrip('\r\n')

124

125

def _load():
    """Create the worker state with the English checkpoint pre-loaded.

    Loads the `facebook/mms-tts-eng` tokenizer and model, places the
    model on CUDA when torch reports it as available (falling back to
    CPU if the probe or the move fails), and returns a state object
    with two attributes:

      * ``device`` — ``'cuda'`` or ``'cpu'``.
      * ``cache`` — dict mapping ISO 639-3 code to a
        ``(tokenizer, model)`` tuple, seeded with the ``'eng'`` entry.
        Other languages are added to this cache by `_synthesize`.
    """
    from transformers import VitsTokenizer, VitsModel

    # Any failure importing or probing torch is treated as "no GPU".
    try:
        import torch
        cuda_ok = torch.cuda.is_available()
    except Exception:
        cuda_ok = False
    device = 'cuda' if cuda_ok else 'cpu'

    repo = 'facebook/mms-tts-eng'
    tokenizer = VitsTokenizer.from_pretrained(repo)
    model = VitsModel.from_pretrained(repo)

    if device == 'cuda':
        try:
            model = model.to('cuda')
        except Exception:
            # The GPU move failed, so record CPU as the device.
            device = 'cpu'

    class _State:
        """Mutable per-process state shared across synthesize requests."""

        def __init__(self, device_name, initial_cache):
            self.device = device_name
            # iso3 → (tokenizer, model)
            self.cache = initial_cache

    return _State(device, {'eng': (tokenizer, model)})

158

159

def _load_language(state, iso3):
    """Load the tokenizer/model pair for `iso3` and store it in `state.cache`.

    Propagates any exception raised while loading the checkpoint; a
    failed move to CUDA is tolerated and leaves the model on CPU.
    """
    from transformers import VitsTokenizer, VitsModel

    repo = f'facebook/mms-tts-{iso3}'
    tokenizer = VitsTokenizer.from_pretrained(repo)
    model = VitsModel.from_pretrained(repo)
    if state.device == 'cuda':
        try:
            model = model.to('cuda')
        except Exception:
            # Best-effort: keep the model on CPU. _synthesize moves the
            # inputs to wherever the model actually lives, so this is
            # safe even though state.device still says 'cuda'.
            pass
    state.cache[iso3] = (tokenizer, model)
    return tokenizer, model


def _synthesize(state, req: dict) -> dict:
    """Synthesize one utterance and write it to `req['output_path']`.

    Args:
        state: Object returned by `_load`; its ``device`` and ``cache``
            attributes are read, and ``cache`` gains an entry the first
            time a language is requested.
        req: Request dict. Keys read: ``text`` (required, non-blank),
            ``output_path`` (required), ``language`` (defaults to
            ``'en'``).

    Returns:
        On success, a dict with ``path``, ``duration`` (seconds, rounded
        to 2 decimals), ``sample_rate``, ``engine``, ``device``,
        ``language``, ``iso3`` and ``voice``. On failure, a dict with
        ``error``; unsupported language, checkpoint load failure and
        missing uroman additionally set ``transient: True``.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    iso3 = _iso1_to_iso3(req.get('language', 'en'))
    if not iso3:
        return {
            'error': (
                f"MMS-TTS has no mapped checkpoint for language "
                f"'{req.get('language')}'"
            ),
            'transient': True,
        }

    # Lazy-load the per-language model.
    cached = state.cache.get(iso3)
    if cached is None:
        try:
            cached = _load_language(state, iso3)
        except Exception as e:
            return {
                'error': f'mms-tts-{iso3} load failed: {e}',
                'transient': True,
            }
    tokenizer, model = cached

    # The tokenizer's is_uroman flag marks checkpoints that expect
    # romanized text; only non-ASCII input needs the uroman pass.
    if getattr(tokenizer, 'is_uroman', False) and not text.isascii():
        roman = _try_uromanize(text)
        if roman is None:
            return {
                'error': (
                    f"mms-tts-{iso3} requires uroman for non-Roman "
                    f"input; install `pip install uroman` or set "
                    f"UROMAN env var to the isi-nlp/uroman repo path"
                ),
                'transient': True,
            }
        text = roman

    import torch

    inputs = tokenizer(text=text, return_tensors='pt')
    # Follow the model's real device rather than state.device: a
    # lazily loaded model may have stayed on CPU after a failed CUDA
    # move, and mixing devices would fail inside the forward pass.
    model_device = next(model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].detach().cpu().numpy()
    sr = int(model.config.sampling_rate)

    # Write WAV via soundfile.
    import soundfile as _sf
    _sf.write(output_path, waveform, sr)

    duration = round(len(waveform) / sr, 2)

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'mms-tts',
        # Report where this request actually ran.
        'device': model_device.type,
        'language': req.get('language', 'en'),
        'iso3': iso3,
        'voice': 'default',
    }

249

250

251# ── Parent-side: one ToolWorker instance ─────────────────────────

252

# Single parent-side worker handle for the MMS-TTS subprocess.
_tool = ToolWorker(
    # Identity
    tool_name='mms_tts',
    engine='mms-tts',
    tool_module='integrations.service_tools.mms_tts_tool',
    # Resources
    vram_budget='tts_mms_tts',
    output_subdir='mms_tts/output',
    # Timeouts
    request_timeout=90.0,
    startup_timeout=120.0,  # first-time per-language download is ~150 MB
)

262

263

def mms_tts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize speech with MMS-TTS through the shared worker.

    All four arguments are forwarded unchanged, by keyword, to
    `_tool.synthesize`, and its result is returned as-is.

    Args:
        text: Text to speak.
        language: Language tag; defaults to ``'en'``.
        voice: Forwarded to the worker. The worker-side `_synthesize`
            in this module does not read it and always reports
            ``'default'``.
        output_path: Destination path for the audio, forwarded as given.

    Returns:
        JSON. On subprocess crash or unsupported language the response
        contains `transient: true` so the caller can fall back.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _tool.synthesize(**request)

281

282

def unload_mms_tts():
    """Stop the MMS-TTS worker subprocess and free its VRAM.

    Delegates to `_tool.stop()`; returns None.
    """
    worker = _tool
    worker.stop()

286

287

class MMSTTSTool:
    """Registers MMS-TTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the MMS-TTS tool descriptor and add it to the registry.

        Creates a `ServiceToolInfo` for the ``mms_tts`` tool, marks it
        healthy, and stores it under the ``"mms_tts"`` key of
        `service_tool_registry._tools`.

        Returns:
            True, unconditionally, once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        summary = (
            "MMS-TTS: Meta's Massively Multilingual Speech TTS. "
            "1100+ languages via per-language VITS checkpoints, "
            "~1 GB VRAM, no voice cloning. "
            "Non-Roman scripts need uroman (perl or "
            "`pip install uroman`). Uses transformers — no extra pip dep."
        )

        # The only endpoint exposed by this tool.
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with MMS-TTS (1100+ languages, GPU/CPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
            },
        }

        info = ServiceToolInfo(
            name="mms_tts",
            description=summary,
            base_url="inprocess://mms_tts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "multilingual", "mms", "vits"],
            timeout=60,
        )
        info.is_healthy = True
        service_tool_registry._tools["mms_tts"] = info
        return True

321

322

323# NOTE: no `if __name__ == '__main__':` block — gpu_worker dispatcher

324# resolves `_load` / `_synthesize` by convention.

Coverage for integrations / service_tools / mms_tts_tool.py: 0.0%

96 statements