Coverage for integrations / service_tools / mms_tts_tool.py: 0.0%

96 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2MMS-TTS tool — Meta's Massively Multilingual Speech TTS (1100+ languages). 

3 

4VRAM: ~1.0 GB on GPU; runs comfortably on CPU too. 

5Architecture: VITS (the same flow-based model VITS-MMS papers describe). 

6HF: facebook/mms-tts-<iso639-3> (per-language checkpoint, ~150 MB each). 

7 

8Requires: only `transformers` (already bundled in Nunba's python-embed 

9and HARTOS's main deps). No new pip dep on Linux/macOS. 

10 

11For non-Roman script languages (Arabic, Hindi, Mandarin, Korean, ...) 

12the upstream VitsTokenizer flags `is_uroman=True` and expects pre- 

13romanized input via the `uroman` perl package. This tool detects 

14that flag and: 

15 - if the optional `uroman` Python wrapper (or the `UROMAN` env var 

16 pointing at the perl repo) is present → romanizes automatically; 

17 - else returns `{'error': ..., 'transient': true}` so the router 

18 falls through to the next engine in the language preference list. 

19 

20This keeps MMS as the universal-coverage fallback without breaking 

21when uroman isn't installed — the language preference order picks up 

22Indic Parler / XTTS / MeloTTS first when they're available. 

23 

24SUBPROCESS ISOLATED: same convention as f5_tts_tool / chatterbox_tool. 

25 

26Public API (parent side): 

27 mms_tts_synthesize(text, language, voice, output_path) → JSON 

28 unload_mms_tts() → None 

29""" 

30 

31from typing import Optional 

32 

33import os 

34import sys 

35 

36from integrations.service_tools.gpu_worker import ToolWorker 

37 

38# ── ISO 639-1 → ISO 639-3 mapping for MMS-TTS repos ────────────── 

39# 

40# MMS-TTS uses 3-letter ISO 639-3 codes (eng / fra / hin / cmn / ...). 

41# Nunba and HARTOS speak ISO 639-1 (en / fr / hi / zh). This map is 

42# the SINGLE bridge between the two — every language in 

43# core.constants.SUPPORTED_LANG_DICT that has a known mms-tts-<iso3> 

44# repo is listed below. Codes deliberately NOT mapped here either 

45# (a) don't have a HuggingFace mms-tts checkpoint, or (b) use a 

46# different ISO3 than the obvious 1↔3 collation and need verification 

47# before we route real users through them. 

48# 

49# Source: facebook/mms-tts model collection on HuggingFace. 

50 

# Two-letter (ISO 639-1) request code → three-letter code used in the
# `facebook/mms-tts-<iso3>` repository name.  One entry per line so that
# additions and removals show up as single-line diffs.
ISO1_TO_ISO3 = {
    # Major European
    'en': 'eng',
    'es': 'spa',
    'fr': 'fra',
    'de': 'deu',
    'it': 'ita',
    'pt': 'por',
    'nl': 'nld',
    'pl': 'pol',
    'tr': 'tur',
    'ru': 'rus',
    'cs': 'ces',
    'hu': 'hun',
    'sv': 'swe',
    'fi': 'fin',
    'el': 'ell',
    'ro': 'ron',
    'bg': 'bul',
    'uk': 'ukr',
    'cy': 'cym',
    'is': 'isl',
    # CJK + SEA
    'zh': 'cmn',
    'ja': 'jpn',
    'ko': 'kor',
    'vi': 'vie',
    'th': 'tha',
    'id': 'ind',
    'ms': 'zlm',
    'km': 'khm',
    'lo': 'lao',
    'my': 'mya',
    # Indic (subset that has explicit mms-tts checkpoints)
    'hi': 'hin',
    'bn': 'ben',
    'ta': 'tam',
    'te': 'tel',
    'mr': 'mar',
    'gu': 'guj',
    'kn': 'kan',
    'ml': 'mal',
    'pa': 'pan',
    'or': 'ory',
    'ne': 'nep',
    'as': 'asm',
    'sd': 'snd',
    'sa': 'san',
    'ur': 'urd',
    'si': 'sin',
    # Middle East / Africa
    'ar': 'ara',
    'fa': 'pes',
    'he': 'heb',
    'sw': 'swh',
}

68 

69 

def _iso1_to_iso3(req_lang: Optional[str]) -> Optional[str]:
    """Map a requested language tag to the MMS-TTS ISO 639-3 code.

    Only the primary subtag is considered: ``pt_BR`` and ``pt-BR`` are
    both looked up as ``pt``, case-insensitively.  A missing or empty
    tag is looked up as English.

    Returns ``None`` when the primary subtag has no entry in
    ``ISO1_TO_ISO3``; the caller is expected to treat that as "this
    engine cannot serve the language" and move on to the next one.
    """
    if req_lang:
        # Accept both `_` and `-` as the subtag separator.
        primary, _sep, _rest = req_lang.replace('_', '-').partition('-')
        return ISO1_TO_ISO3.get(primary.lower())
    return ISO1_TO_ISO3.get('en')

81 

82 

def _get_uroman():
    """Return the shared ``uroman.Uroman`` instance, creating it on first use.

    Constructing ``Uroman()`` loads its romanization data, so the
    instance is kept on the function object and reused by later calls.
    Any exception from the import or the constructor propagates and
    nothing is cached, so a later call simply retries.
    """
    instance = getattr(_get_uroman, '_instance', None)
    if instance is None:
        import uroman as _uroman_pkg  # type: ignore
        instance = _uroman_pkg.Uroman()
        _get_uroman._instance = instance
    return instance


def _try_uromanize(text: str) -> Optional[str]:
    """Best-effort romanization for non-Roman script input.

    Order of attempts:
      1. The `uroman` Python wrapper (`pip install uroman`), using a
         single cached ``Uroman`` instance.
      2. The `UROMAN` env var pointing at the isi-nlp/uroman perl repo;
         ``<UROMAN>/bin/uroman.pl`` is run with `perl`, fed the text on
         stdin, with a 15 second timeout.

    Args:
        text: The text to romanize.

    Returns:
        The romanized string, or ``None`` when neither path produced a
        result (package missing or failing, env var unset, script
        missing, perl missing, timeout, or non-zero exit status).
        Never raises for those conditions.
    """
    # Pure-Python wrapper first.  Deliberately broad: any failure here
    # (not installed, broken install, error on this input) just means we
    # try the perl path next.
    try:
        return _get_uroman().romanize_string(text)
    except Exception:
        pass

    # Perl repo via UROMAN env var.
    uroman_root = os.environ.get('UROMAN')
    if not uroman_root:
        return None
    script = os.path.join(uroman_root, 'bin', 'uroman.pl')
    if not os.path.isfile(script):
        return None

    import subprocess
    try:
        proc = subprocess.run(
            ['perl', script],
            input=text.encode('utf-8'),
            capture_output=True,
            timeout=15,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # perl not on PATH, timeout, or text that cannot be encoded.
        return None
    if proc.returncode != 0:
        return None

    out = proc.stdout.decode('utf-8', errors='replace')
    # Strip the trailing line terminator in either LF or CRLF form so a
    # stray '\r' never reaches the tokenizer.
    return out.rstrip('\r\n')

124 

125 

def _load():
    """Build the worker state, pre-loading the English MMS-TTS checkpoint.

    Checkpoints are per-language, so the returned state carries a cache
    keyed by ISO 639-3 code.  Only ``'eng'`` is loaded here; the other
    languages are added to the cache by `_synthesize` on first use.

    Returns:
        An object with two attributes:
          * ``device`` — ``'cuda'`` or ``'cpu'``
          * ``cache``  — ``{iso3: (tokenizer, model)}``
    """
    from transformers import VitsTokenizer, VitsModel

    # Default to CPU; upgrade only when torch imports and reports CUDA.
    device = 'cpu'
    try:
        import torch
        if torch.cuda.is_available():
            device = 'cuda'
    except Exception:
        device = 'cpu'

    default_repo = 'facebook/mms-tts-eng'
    eng_tokenizer = VitsTokenizer.from_pretrained(default_repo)
    eng_model = VitsModel.from_pretrained(default_repo)
    if device == 'cuda':
        try:
            eng_model = eng_model.to('cuda')
        except Exception:
            # Could not place the model on the GPU: run on CPU instead.
            device = 'cpu'

    class _State:
        def __init__(self, device_name, models):
            self.device = device_name
            # iso3 → (tokenizer, model)
            self.cache = models

    return _State(device, {'eng': (eng_tokenizer, eng_model)})

158 

159 

def _synthesize(state, req: dict) -> dict:
    """Synthesize one request to a WAV file with the per-language MMS model.

    Args:
        state: The object returned by `_load`; its ``device`` and
            ``cache`` attributes are read, and ``cache`` gains an entry
            when a new language is loaded.
        req: Request dict.  Keys read: ``text`` (required),
            ``output_path`` (required), ``language`` (optional,
            defaults to ``'en'``).

    Returns:
        On success, a dict with ``path``, ``duration`` (seconds, two
        decimals), ``sample_rate``, ``engine``, ``device``,
        ``language``, ``iso3`` and ``voice``.
        On failure, ``{'error': ...}``.  Unmapped language, checkpoint
        load failure and missing uroman additionally carry
        ``'transient': True``.

    Exceptions raised by the tokenizer, the model forward pass or
    `soundfile.write` are not caught here.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    iso3 = _iso1_to_iso3(req.get('language', 'en'))
    if not iso3:
        return {
            'error': (
                f"MMS-TTS has no mapped checkpoint for language "
                f"'{req.get('language')}'"
            ),
            'transient': True,
        }

    # Lazy-load the per-language model
    if iso3 not in state.cache:
        from transformers import VitsTokenizer, VitsModel
        repo = f'facebook/mms-tts-{iso3}'
        try:
            tokenizer = VitsTokenizer.from_pretrained(repo)
            model = VitsModel.from_pretrained(repo)
            if state.device == 'cuda':
                try:
                    model = model.to('cuda')
                except Exception:
                    # GPU placement failed (e.g. out of memory).  Put the
                    # model back on the CPU explicitly so it is never left
                    # split across devices; this language then runs on CPU.
                    model = model.to('cpu')
            state.cache[iso3] = (tokenizer, model)
        except Exception as e:
            return {
                'error': f'mms-tts-{iso3} load failed: {e}',
                'transient': True,
            }

    tokenizer, model = state.cache[iso3]

    # Romanize input text on demand for non-Roman script languages.
    # The VitsTokenizer.is_uroman flag tells us whether the model was
    # trained on romanized text.  If True and the input contains
    # non-ASCII, route through uroman first.
    if getattr(tokenizer, 'is_uroman', False) and not text.isascii():
        roman = _try_uromanize(text)
        if roman is None:
            return {
                'error': (
                    f"mms-tts-{iso3} requires uroman for non-Roman "
                    f"input; install `pip install uroman` or set "
                    f"UROMAN env var to the isi-nlp/uroman repo path"
                ),
                'transient': True,
            }
        text = roman

    import torch

    # Send the inputs to wherever THIS model lives, not to the worker-wide
    # state.device: a language whose GPU placement failed at load time
    # stays on the CPU even when state.device is 'cuda'.
    model_device = next(model.parameters()).device
    inputs = tokenizer(text=text, return_tensors='pt')
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].detach().cpu().numpy()
    sr = int(model.config.sampling_rate)

    # Write WAV via soundfile (already a transitive of the bigger TTS
    # engines so it's reliably present).
    import soundfile as _sf
    _sf.write(output_path, waveform, sr)

    duration = round(len(waveform) / sr, 2)

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'mms-tts',
        # Device the inference actually ran on ('cuda' or 'cpu').
        'device': model_device.type,
        'language': req.get('language', 'en'),
        'iso3': iso3,
        'voice': 'default',
    }

249 

250 

251# ── Parent-side: one ToolWorker instance ───────────────────────── 

252 

# The one parent-side handle for the MMS-TTS worker subprocess; both
# public entry points of this module go through it.
_tool = ToolWorker(
    # Identity
    tool_name='mms_tts',
    engine='mms-tts',
    tool_module='integrations.service_tools.mms_tts_tool',
    # Resources
    vram_budget='tts_mms_tts',
    output_subdir='mms_tts/output',
    # Timeouts (seconds).  Startup is generous because a first-time
    # checkpoint download is ~150 MB per language.
    startup_timeout=120.0,
    request_timeout=90.0,
)

262 

263 

def mms_tts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Run one MMS-TTS synthesis through the worker subprocess.

    All four arguments are forwarded unchanged, by keyword, to the
    module's `ToolWorker` instance.

    Args:
        text: Text to speak.
        language: Language tag of the text; defaults to ``'en'``.
        voice: Forwarded as-is.  The worker-side `_synthesize` in this
            module does not read it and always reports ``'default'``.
        output_path: Where the WAV should be written, if given.

    Returns:
        JSON.  On subprocess crash or unsupported language the response
        contains `transient: true` so the caller can fall back.
    """
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _tool.synthesize(**request)

281 

282 

def unload_mms_tts():
    """Shut down the MMS-TTS worker subprocess, releasing its VRAM.

    Delegates to the module-level `ToolWorker`; returns nothing.
    """
    _tool.stop()

286 

287 

class MMSTTSTool:
    """Registers MMS-TTS with the service-tool registry as an in-process tool."""

    @classmethod
    def register_functions(cls):
        """Create the `mms_tts` registry entry and install it.

        Builds a `ServiceToolInfo` describing the single `synthesize`
        endpoint, marks it healthy, and stores it under the key
        ``"mms_tts"`` in the registry's tool table, replacing any entry
        already there.

        Returns:
            ``True`` once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        summary = (
            "MMS-TTS: Meta's Massively Multilingual Speech TTS. "
            "1100+ languages via per-language VITS checkpoints, "
            "~1 GB VRAM, no voice cloning. "
            "Non-Roman scripts need uroman (perl or "
            "`pip install uroman`). Uses transformers — no extra pip dep."
        )
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with MMS-TTS (1100+ languages, GPU/CPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
            },
        }

        entry = ServiceToolInfo(
            name="mms_tts",
            description=summary,
            base_url="inprocess://mms_tts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "multilingual", "mms", "vits"],
            timeout=60,
        )
        entry.is_healthy = True
        service_tool_registry._tools["mms_tts"] = entry
        return True

321 

322 

323# NOTE: no `if __name__ == '__main__':` block — gpu_worker dispatcher 

324# resolves `_load` / `_synthesize` by convention.