Coverage for integrations / service_tools / melotts_tool.py: 0.0%

59 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

"""
MeloTTS tool — myshell-ai's multilingual neural TTS (CPU-friendly, GPU optional).

VRAM: ~1.5 GB on GPU; runs at real-time on CPU too.
Per-language model checkpoints on HuggingFace:
    myshell-ai/MeloTTS-English   (also -English-v2, -English-v3)
    myshell-ai/MeloTTS-Spanish
    myshell-ai/MeloTTS-French
    myshell-ai/MeloTTS-Chinese   (mixed Chinese + English)
    myshell-ai/MeloTTS-Japanese
    myshell-ai/MeloTTS-Korean

Requires: pip install melotts (PyPI 'melotts' package). The package ships
with the `melo` import root, so the Python entry is `from melo.api import TTS`.

SUBPROCESS ISOLATED: same convention as f5_tts_tool / chatterbox_tool —
this module exposes `_load` + `_synthesize` callbacks that the gpu_worker
dispatcher imports in a worker subprocess. A CUDA OOM only kills the
worker; the parent receives `transient: true` and can fall back.

Public API (parent side):
    melotts_synthesize(text, language, voice, output_path, speed)  → JSON
    unload_melotts()                                               → None
"""

25 

26from typing import Optional 

27 

28import os 

29import sys 

30 

31from integrations.service_tools.gpu_worker import ToolWorker 

32 

33# ── Language code → MeloTTS variant + speaker id ───────────────── 

34# 

# MeloTTS exposes one HF model per language family + a fixed list of
# speaker ids inside each. Map ISO 639-1 codes to (model_lang_arg,
# default_speaker_key). Unknown codes do NOT raise: `_resolve_lang`
# falls back to English, so the router must filter unsupported
# languages before it selects MeloTTS.

39# 

40# Source of truth: model card on huggingface.co/myshell-ai/MeloTTS-* 

41 

_LANG_TO_MELO = {
    'en': ('EN', 'EN-US'),  # also EN-BR / EN_INDIA / EN-AU / EN-Default
    'es': ('ES', 'ES'),
    'fr': ('FR', 'FR'),
    'zh': ('ZH', 'ZH'),     # Chinese (also handles mixed EN-ZH)
    'ja': ('JP', 'JP'),
    'ko': ('KR', 'KR'),
}


def _resolve_lang(req_lang: Optional[str]) -> tuple[str, str]:
    """Map an ISO 639-1 code or locale tag to MeloTTS arguments.

    The primary subtag is taken from the request (``'zh_CN'`` and
    ``'zh-cn'`` both yield ``'zh'``), lower-cased, and looked up in
    ``_LANG_TO_MELO``. Surrounding whitespace is ignored so that a
    padded value such as ``' ja'`` still selects Japanese instead of
    silently falling through to English.

    Args:
        req_lang: Language code or locale tag; may be ``None`` or empty.

    Returns:
        ``(MeloTTS language arg, default speaker key)``. English is
        returned when ``req_lang`` is empty/``None`` or when the code is
        not in ``_LANG_TO_MELO`` — this function never raises for an
        unknown code, so callers that need strict matching must filter
        beforehand.
    """
    if not req_lang:
        return _LANG_TO_MELO['en']
    # Normalise the separator first so both 'xx_YY' and 'xx-YY' split the same way.
    primary = req_lang.strip().replace('_', '-').split('-')[0]
    code = primary.strip().lower()
    return _LANG_TO_MELO.get(code, _LANG_TO_MELO['en'])

64 

65 

def _load():
    """Create the worker's initial MeloTTS state (English model).

    MeloTTS holds one language per model instance. The worker starts
    with English; `_synthesize` swaps in another language's model on
    demand and records it on the returned object.

    A missing or broken CUDA/torch setup is not an error here: the
    model is simply placed on the CPU, where MeloTTS still runs at
    roughly real-time.

    Returns:
        An object with three attributes: ``tts`` (the loaded model),
        ``lang`` (its MeloTTS language arg, initially ``'EN'``) and
        ``device`` (``'cuda'`` or ``'cpu'``).
    """
    # An import failure here is intentionally left to propagate to the worker.
    from melo.api import TTS

    # Choose the device from CUDA availability. The worker self-reports
    # post-load VRAM via the __WORKER_VRAM_GB__ marker so vram_manager
    # can tighten the budget if it was declared too high.
    try:
        import torch
        has_cuda = torch.cuda.is_available()
    except Exception:
        has_cuda = False
    device = 'cuda' if has_cuda else 'cpu'

    model = TTS(language='EN', device=device)

    # Small mutable holder so _synthesize can replace the model and
    # language while keeping the device choice made at spawn time.
    class _State:
        def __init__(self, tts, lang, device):
            self.tts = tts
            self.lang = lang
            self.device = device

    return _State(model, 'EN', device)

100 

101 

def _synthesize(state, req: dict) -> dict:
    """Synthesize one request with the worker's MeloTTS model.

    Args:
        state: Object returned by ``_load``. Its ``tts`` and ``lang``
            attributes are replaced in place when the request needs a
            different language; ``device`` is only read.
        req: Request dict. Keys read here:
            ``text`` (required), ``output_path`` (required),
            ``language`` (default ``'en'``), ``voice`` (optional speaker
            key such as ``'EN-BR'``) and ``speed`` (optional, default 1.0).

    Returns:
        ``{'error': <message>}`` when ``text`` or ``output_path`` is
        missing or ``speed`` is not a positive finite number. Otherwise a
        dict with ``path``, ``duration``, ``sample_rate``, ``engine``,
        ``device``, ``language`` (as requested) and ``voice`` (the
        requested speaker key when the model has it, else the language's
        default key).
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # Validate speed up front so a bad request is rejected before any
    # (expensive) model swap. Falsy values keep the historical default.
    raw_speed = req.get('speed')
    try:
        speed = float(raw_speed) if raw_speed else 1.0
    except (TypeError, ValueError):
        return {'error': f'speed must be a positive number, got {raw_speed!r}'}
    # The chained comparison is False for NaN, so NaN/inf/<=0 are all rejected.
    if not (0 < speed < float('inf')):
        return {'error': f'speed must be a positive number, got {raw_speed!r}'}

    language = req.get('language', 'en')
    melo_lang, default_speaker = _resolve_lang(language)

    # Re-instantiate on language switch (one model per language).
    if melo_lang != state.lang:
        from melo.api import TTS
        state.tts = TTS(language=melo_lang, device=state.device)
        state.lang = melo_lang

    speaker_ids = state.tts.hps.data.spk2id

    def _speaker_id(key):
        # spk2id may be a plain dict or a dict-like config object without
        # .get(); support both and return None when the key is absent.
        if hasattr(speaker_ids, 'get'):
            return speaker_ids.get(key)
        return speaker_ids[key] if key in speaker_ids else None

    # Honour an explicitly requested speaker when this model has it.
    # NOTE(review): assumes the dispatcher forwards the parent's `voice`
    # argument under the 'voice' key — confirm against gpu_worker. When
    # the key is absent the behaviour is unchanged (language default).
    speaker_key = default_speaker
    spk_id = None
    requested_voice = req.get('voice')
    if requested_voice:
        spk_id = _speaker_id(requested_voice)
        if spk_id is not None:
            speaker_key = requested_voice
    if spk_id is None:
        spk_id = _speaker_id(default_speaker)
    if spk_id is None:
        # Last resort: first speaker of this model (e.g. the default key
        # is not present in some EN variants).
        spk_id = next(iter(speaker_ids.values())) if speaker_ids else 0

    state.tts.tts_to_file(
        text=text,
        speaker_id=spk_id,
        output_path=output_path,
        speed=speed,
    )

    # Best-effort duration probe — soundfile is the lightest reader.
    try:
        import soundfile as _sf
        info = _sf.info(output_path)
        duration = round(info.frames / info.samplerate, 2)
        sr = info.samplerate
    except Exception:
        duration = 0.0
        # Prefer the rate the loaded model is configured with; keep the
        # previous constant only if the config does not expose one.
        sr = getattr(state.tts.hps.data, 'sampling_rate', 24000)

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'melotts',
        'device': state.device,
        'language': language,
        'voice': speaker_key,
    }

155 

156 

157# ── Parent-side: one ToolWorker instance ───────────────────────── 

158 

# Module-level worker handle shared by melotts_synthesize() and
# unload_melotts(). `tool_module` is this module's own import path —
# the worker side loads `_load` / `_synthesize` from it.
_tool = ToolWorker(
    tool_name='melotts',
    engine='melotts',
    tool_module='integrations.service_tools.melotts_tool',
    vram_budget='tts_melotts',
    output_subdir='melotts/output',
    # Timeouts are in seconds (presumably; confirm against ToolWorker).
    startup_timeout=90.0,
    request_timeout=90.0,
)

168 

169 

def melotts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Run a MeloTTS synthesis request through the shared worker.

    All arguments are handed to ``_tool.synthesize`` unchanged. ``speed``
    travels in ``extra_request`` and is only included when it differs
    from the 1.0 default.

    Returns:
        Whatever ``_tool.synthesize`` returns — a JSON string in the same
        format as the other TTS tools. On a subprocess crash the response
        carries `transient: true` so the caller (Nunba TTSEngine /
        HARTOS tts_router) can fall back to another engine.
    """
    extra = None
    if speed != 1.0:
        extra = {'speed': speed}

    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request=extra,
    )

190 

191 

def unload_melotts():
    """Shut down the MeloTTS worker subprocess so its VRAM is released.

    Delegates to ``_tool.stop()``; returns ``None``.
    """
    _tool.stop()

195 

196 

class MeloTTSTool:
    """Registration hook that exposes MeloTTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the MeloTTS ``ServiceToolInfo`` and store it in the registry.

        The entry is marked healthy and written directly into
        ``service_tool_registry._tools`` under the key ``"melotts"``.

        Returns:
            Always ``True``.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        description = (
            "MeloTTS: myshell-ai multilingual neural TTS. "
            "6 languages (en/es/fr/zh/ja/ko), ~1.5GB VRAM, runs on "
            "CPU at real-time too. Multiple English accents "
            "(US/BR/IN/AU). No voice cloning. "
            "Requires: pip install melotts"
        )

        # Parameters advertised for the single "synthesize" endpoint.
        synthesize_params = {
            "text": {"type": "string"},
            "language": {"type": "string"},
            "voice": {"type": "string", "description": "Speaker id (EN-US, EN-BR, ...)"},
        }
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with MeloTTS (multilingual, GPU/CPU).",
            "params_schema": synthesize_params,
        }

        tool_info = ServiceToolInfo(
            name="melotts",
            description=description,
            base_url="inprocess://melotts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "multilingual", "neural", "melotts"],
            timeout=60,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["melotts"] = tool_info
        return True

231 

232 

233# NOTE: no `if __name__ == '__main__':` block — the gpu_worker 

234# dispatcher picks up `_load` / `_synthesize` when invoked via 

235# `python -m integrations.service_tools.gpu_worker 

236# integrations.service_tools.melotts_tool`.