Coverage for integrations/service_tools/melotts

1"""

2MeloTTS tool — myshell-ai's multilingual neural TTS (CPU-friendly, GPU optional).

4VRAM: ~1.5 GB on GPU; runs at real-time on CPU too.

5Per-language model checkpoints on HuggingFace:

6 myshell-ai/MeloTTS-English (also -English-v2, -English-v3)

7 myshell-ai/MeloTTS-Spanish

8 myshell-ai/MeloTTS-French

9 myshell-ai/MeloTTS-Chinese (mixed Chinese + English)

10 myshell-ai/MeloTTS-Japanese

11 myshell-ai/MeloTTS-Korean

13Requires: pip install melotts (PyPI 'melotts' package). Ships with the

14`melo` import root, so the Python entry is `from melo.api import TTS`.

16SUBPROCESS ISOLATED: same convention as f5_tts_tool / chatterbox_tool —

17this module exposes `_load` + `_synthesize` callbacks that the gpu_worker

18dispatcher imports in a worker subprocess. CUDA OOM only kills the

19worker; parent receives `transient: true` and can fall back.

21Public API (parent side):

22 melotts_synthesize(text, language, voice, output_path) → JSON

23 unload_melotts() → None

24"""

26from typing import Optional

28import os

29import sys

31from integrations.service_tools.gpu_worker import ToolWorker

33# ── Language code → MeloTTS variant + speaker id ─────────────────

34#

35# MeloTTS exposes one HF model per language family + a fixed list of

36# speaker ids inside each. Map ISO 639-1 codes to (model_lang_arg,

# default_speaker_key). Unknown codes do NOT raise: _resolve_lang falls
# back to English, so the router must filter unsupported languages
# before selecting MeloTTS.

39#

40# Source of truth: model card on huggingface.co/myshell-ai/MeloTTS-*

_LANG_TO_MELO = {
    'en': ('EN', 'EN-US'),  # also EN-BR / EN_INDIA / EN-AU / EN-Default
    'es': ('ES', 'ES'),
    'fr': ('FR', 'FR'),
    'zh': ('ZH', 'ZH'),  # Chinese (also handles mixed EN-ZH)
    'ja': ('JP', 'JP'),
    'ko': ('KR', 'KR'),
}


def _resolve_lang(req_lang: Optional[str]) -> tuple[str, str]:
    """Map a language tag to ``(MeloTTS language arg, default speaker key)``.

    Only the primary subtag is looked up, case-insensitively, so
    ``'en_US'``, ``'EN-gb'`` and ``' en '`` all resolve to the ``'en'``
    entry of ``_LANG_TO_MELO``.

    Args:
        req_lang: Language tag such as ``'fr'`` or ``'zh-CN'``; may be
            ``None`` or empty.

    Returns:
        The ``_LANG_TO_MELO`` entry for the tag. The English entry is
        returned when the tag is missing, empty, not a string, or not in
        the table — this function never raises. Callers that need to
        reject unsupported languages must do so before calling it.
    """
    english = _LANG_TO_MELO['en']
    if not isinstance(req_lang, str):
        # None and any non-string value: nothing to parse, use the default.
        return english
    # Strip first: a stray space (e.g. 'fr ') would otherwise miss the
    # table and silently turn into English.
    primary = req_lang.strip().replace('_', '-').split('-')[0].lower()
    return _LANG_TO_MELO.get(primary, english)

def _load():
    """Create the worker-side MeloTTS state, starting with the English model.

    A ``TTS`` instance serves a single language, so the worker boots with
    ``'EN'`` and ``_synthesize`` replaces ``.tts`` / ``.lang`` on the
    returned object when a request asks for another language.

    Returns:
        A small state object with three attributes: ``tts`` (the loaded
        ``melo.api.TTS`` instance), ``lang`` (``'EN'``) and ``device``
        (``'cuda'`` or ``'cpu'``).

    Raises:
        Whatever ``from melo.api import TTS`` or the ``TTS`` constructor
        raises; nothing is caught here so the failure reaches the caller.
    """
    from melo.api import TTS

    # CPU is an accepted target, so a missing CUDA runtime — or any error
    # while probing torch — just selects 'cpu' instead of failing the load.
    device = 'cpu'
    try:
        import torch
        if torch.cuda.is_available():
            device = 'cuda'
    except Exception:
        device = 'cpu'

    # Mutable holder: keeps the device hint next to the model so a later
    # language switch can rebuild the model on the same device.
    class _State:
        def __init__(self_, tts, lang, device_name):
            self_.tts = tts
            self_.lang = lang
            self_.device = device_name

    return _State(TTS(language='EN', device=device), 'EN', device)

100

101

def _synthesize(state, req: dict) -> dict:
    """Synthesize ``req['text']`` into ``req['output_path']`` with MeloTTS.

    Args:
        state: Object returned by ``_load`` exposing ``tts``, ``lang`` and
            ``device``. ``tts`` and ``lang`` are replaced in place when the
            request needs a different language model.
        req: Request dict. Keys read:
            ``text`` (required, non-blank string),
            ``output_path`` (required),
            ``language`` (optional, defaults to ``'en'``),
            ``voice`` (optional speaker key such as ``'EN-BR'``; assumed to
            be forwarded by the parent-side ToolWorker under this key —
            confirm against gpu_worker),
            ``speed`` (optional positive number, defaults to ``1.0``).

    Returns:
        ``{'error': <message>}`` when the request is invalid. Otherwise a
        dict with ``path``, ``duration`` (seconds, ``0.0`` if the probe
        failed), ``sample_rate``, ``engine``, ``device``, ``language`` (as
        requested) and ``voice`` (the speaker key actually used).
    """
    text = req.get('text', '')
    if not isinstance(text, str) or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # Validate speed with the other inputs, before any model work. A
    # missing / None / 0 value keeps the historical default of 1.0; a
    # non-numeric, negative, NaN or infinite value is rejected instead of
    # being handed to the engine.
    try:
        speed = float(req.get('speed') or 1.0)
    except (TypeError, ValueError):
        return {'error': 'speed must be a positive number'}
    if not 0.0 < speed < float('inf'):
        return {'error': 'speed must be a positive number'}

    language = req.get('language', 'en')
    melo_lang, default_speaker = _resolve_lang(language)

    # Re-instantiate on language switch (one model per language). state is
    # only updated after the constructor succeeds, so a failed switch
    # leaves the previous model usable.
    if melo_lang != state.lang:
        from melo.api import TTS
        state.tts = TTS(language=melo_lang, device=state.device)
        state.lang = melo_lang

    # spk2id is treated as a minimal mapping (`in`, `[]`, `keys()`, len);
    # `.get` is deliberately not relied on.
    speaker_ids = state.tts.hps.data.spk2id

    # Speaker preference: the caller's voice, then the language default,
    # then whatever speaker the model lists first.
    speaker_key = None
    for candidate in (req.get('voice'), default_speaker):
        if isinstance(candidate, str) and candidate in speaker_ids:
            speaker_key = candidate
            break
    if speaker_key is not None:
        spk_id = speaker_ids[speaker_key]
    elif speaker_ids:
        speaker_key = next(iter(speaker_ids.keys()))
        spk_id = speaker_ids[speaker_key]
    else:
        # Model exposes no speakers at all: keep the legacy id 0.
        speaker_key = default_speaker
        spk_id = 0

    state.tts.tts_to_file(
        text=text,
        speaker_id=spk_id,
        output_path=output_path,
        speed=speed,
    )

    # Best-effort duration probe — a failure here must not fail a
    # synthesis that already wrote its file.
    try:
        import soundfile as _sf
        info = _sf.info(output_path)
        duration = round(info.frames / info.samplerate, 2)
        sr = info.samplerate
    except Exception:
        duration = 0.0
        # Prefer the loaded model's configured rate; 24000 is the legacy
        # fallback when the model config does not expose one.
        sr = getattr(state.tts.hps.data, 'sampling_rate', 24000)

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'melotts',
        'device': state.device,
        'language': language,
        'voice': speaker_key,
    }

155

156

157# ── Parent-side: one ToolWorker instance ─────────────────────────

158

# Single parent-side handle for the MeloTTS worker; melotts_synthesize()
# and unload_melotts() both go through it.
_tool = ToolWorker(
    # Identity: `tool_module` is this module's import path, i.e. where the
    # `_load` / `_synthesize` callbacks are defined.
    tool_name='melotts',
    engine='melotts',
    tool_module='integrations.service_tools.melotts_tool',
    # Resource / output settings (semantics are defined by ToolWorker).
    vram_budget='tts_melotts',
    output_subdir='melotts/output',
    # Timeouts — presumably seconds; confirm against ToolWorker.
    startup_timeout=90.0,
    request_timeout=90.0,
)

168

169

def melotts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Run one MeloTTS synthesis request through the shared ToolWorker.

    Args:
        text: Text to speak.
        language: Language tag forwarded to the worker.
        voice: Optional speaker key forwarded to the worker.
        output_path: Optional destination path forwarded to the worker.
        speed: Speed value; only sent to the worker when it differs
            from ``1.0``.

    Returns:
        The value of ``_tool.synthesize(...)`` — a JSON string in the same
        format as the other TTS tools. If the worker subprocess crashes,
        the response is expected to carry ``transient: true`` so the
        caller (Nunba TTSEngine / HARTOS tts_router) can fall back.
    """
    # The default speed is left out of the request entirely, so the
    # worker applies its own default.
    extra = None
    if speed != 1.0:
        extra = {'speed': speed}

    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request=extra,
    )

190

191

def unload_melotts():
    """Shut down the MeloTTS worker by calling ``stop()`` on the shared ToolWorker.

    Intended to end the worker subprocess and free the VRAM it holds.
    Returns ``None``.
    """
    _tool.stop()

195

196

class MeloTTSTool:
    """Registers MeloTTS in the service-tool registry as an in-process tool."""

    @classmethod
    def register_functions(cls):
        """Build the MeloTTS ``ServiceToolInfo`` and store it in the registry.

        The entry is marked healthy and written straight into
        ``service_tool_registry._tools`` under the key ``"melotts"``.

        Returns:
            ``True`` once the entry has been stored.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        summary = (
            "MeloTTS: myshell-ai multilingual neural TTS. "
            "6 languages (en/es/fr/zh/ja/ko), ~1.5GB VRAM, runs on "
            "CPU at real-time too. Multiple English accents "
            "(US/BR/IN/AU). No voice cloning. "
            "Requires: pip install melotts"
        )

        # Schema of the single exposed endpoint.
        synthesize_params = {
            "text": {"type": "string"},
            "language": {"type": "string"},
            "voice": {"type": "string", "description": "Speaker id (EN-US, EN-BR, ...)"},
        }
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with MeloTTS (multilingual, GPU/CPU).",
            "params_schema": synthesize_params,
        }

        entry = ServiceToolInfo(
            name="melotts",
            description=summary,
            base_url="inprocess://melotts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "multilingual", "neural", "melotts"],
            timeout=60,
        )
        # Flagged healthy at registration time; no health probe is run here.
        entry.is_healthy = True
        service_tool_registry._tools["melotts"] = entry
        return True

231

232

233# NOTE: no `if __name__ == '__main__':` block — the gpu_worker

234# dispatcher picks up `_load` / `_synthesize` when invoked via

235# `python -m integrations.service_tools.gpu_worker

236# integrations.service_tools.melotts_tool`.

Coverage for integrations / service_tools / melotts_tool.py: 0.0%

59 statements