Coverage for integrations / service_tools / melotts_tool.py: 0.0%
59 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2MeloTTS tool — myshell-ai's multilingual neural TTS (CPU-friendly, GPU optional).
4VRAM: ~1.5 GB on GPU; runs at real-time on CPU too.
5Per-language model checkpoints on HuggingFace:
6 myshell-ai/MeloTTS-English (also -English-v2, -English-v3)
7 myshell-ai/MeloTTS-Spanish
8 myshell-ai/MeloTTS-French
9 myshell-ai/MeloTTS-Chinese (mixed Chinese + English)
10 myshell-ai/MeloTTS-Japanese
11 myshell-ai/MeloTTS-Korean
13Requires: pip install melotts (PyPI 'melotts' package). Ships with the
14`melo` import root, so the Python entry is `from melo.api import TTS`.
16SUBPROCESS ISOLATED: same convention as f5_tts_tool / chatterbox_tool —
17this module exposes `_load` + `_synthesize` callbacks that the gpu_worker
18dispatcher imports in a worker subprocess. CUDA OOM only kills the
19worker; parent receives `transient: true` and can fall back.
21Public API (parent side):
22 melotts_synthesize(text, language, voice, output_path, speed) → JSON
23 unload_melotts() → None
24"""
26from typing import Optional
28import os
29import sys
31from integrations.service_tools.gpu_worker import ToolWorker
# ── Language code → MeloTTS variant + speaker id ─────────────────
#
# MeloTTS exposes one HF model per language family + a fixed list of
# speaker ids inside each. Map ISO 639-1 codes to (model_lang_arg,
# default_speaker_key). Unknown codes are NOT rejected here:
# `_resolve_lang` falls back to English, so the router has to filter
# by language before it selects MeloTTS.
#
# Source of truth: model card on huggingface.co/myshell-ai/MeloTTS-*

_LANG_TO_MELO = {
    'en': ('EN', 'EN-US'),   # also EN-BR / EN_INDIA / EN-AU / EN-Default
    'es': ('ES', 'ES'),
    'fr': ('FR', 'FR'),
    'zh': ('ZH', 'ZH'),      # Chinese (also handles mixed EN-ZH)
    'ja': ('JP', 'JP'),
    'ko': ('KR', 'KR'),
}


def _resolve_lang(req_lang: Optional[str]) -> tuple[str, str]:
    """Map a language tag to (MeloTTS language arg, default speaker key).

    Only the primary subtag is considered, so 'ja', 'ja-JP', 'ja_JP'
    and ' JA ' all resolve to the Japanese entry. Surrounding
    whitespace is ignored.

    Args:
        req_lang: ISO 639-1 code, optionally with a region suffix
            separated by '-' or '_'. May be None or empty.

    Returns:
        The `_LANG_TO_MELO` entry for the language. A missing, empty
        or unknown code yields the English entry — nothing is raised,
        so callers that need strict behaviour must check the code
        against `_LANG_TO_MELO` themselves.
    """
    if not req_lang:
        return _LANG_TO_MELO['en']
    # Normalise 'xx_YY' → 'xx-YY', keep the primary subtag only, then
    # strip so padded input does not silently miss the table.
    code = req_lang.replace('_', '-').split('-')[0].strip().lower()
    return _LANG_TO_MELO.get(code, _LANG_TO_MELO['en'])
def _load():
    """Create the initial (English) MeloTTS model and wrap it in a state box.

    Only one language model is held at a time. The worker starts with
    English; `_synthesize` replaces `.tts` / `.lang` on the returned
    object when a request asks for another language.

    Returns:
        An object with three attributes: `.tts` (the `melo.api.TTS`
        instance), `.lang` (the MeloTTS language arg currently loaded,
        initially 'EN') and `.device` ('cuda' or 'cpu').

    Raises:
        Whatever `from melo.api import TTS` or the `TTS(...)` call
        raises; nothing is caught here for those.
    """
    from melo.api import TTS  # noqa: F401 — exception bubbles to worker

    # Device choice is best-effort: a missing or broken torch install
    # simply means CPU. The worker self-reports post-load VRAM via the
    # __WORKER_VRAM_GB__ marker so vram_manager can tighten the budget.
    try:
        import torch
        cuda_ok = torch.cuda.is_available()
    except Exception:
        cuda_ok = False
    target = 'cuda' if cuda_ok else 'cpu'

    model = TTS(language='EN', device=target)

    # Small mutable holder so _synthesize can swap the model on a
    # language change while still knowing which device to load onto.
    class _State:
        def __init__(this):
            this.tts = model
            this.lang = 'EN'
            this.device = target

    return _State()
def _pick_speaker(speaker_ids, requested, default_key):
    """Choose a speaker from the model's speaker table.

    Preference order: the caller's `requested` key, then the language's
    `default_key`, then the first speaker the model lists.

    Only `in`, indexing and `.items()` are used on `speaker_ids`, so it
    may be a plain dict or any mapping-like object that supports those.

    Returns:
        (speaker_key, speaker_id). If the table is empty the result is
        (default_key, 0).
    """
    for key in (requested, default_key):
        if isinstance(key, str) and key in speaker_ids:
            found = speaker_ids[key]
            if found is not None:
                return key, found
    if speaker_ids:
        return next(iter(speaker_ids.items()))
    return default_key, 0


def _synthesize(state, req: dict) -> dict:
    """Render `req['text']` to a file with MeloTTS.

    Args:
        state: the object returned by `_load` (`.tts`, `.lang`,
            `.device`). `.tts` / `.lang` are replaced in place when the
            request needs a different language model.
        req: request dict. Keys read here:
            'text' (required, non-blank), 'output_path' (required),
            'language' (default 'en'), 'voice' (optional speaker key
            such as 'EN-BR'), 'speed' (optional, default 1.0).

    Returns:
        On bad input, `{'error': <message>}`. Otherwise a dict with
        'path', 'duration', 'sample_rate', 'engine', 'device',
        'language' (as requested) and 'voice' (the speaker key that
        was actually used).
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    # Validate speed up front, before any model (re)load. A missing or
    # zero value means "default"; anything non-numeric, negative, NaN
    # or infinite is rejected like the other bad inputs.
    try:
        speed = float(req.get('speed') or 1.0)
    except (TypeError, ValueError):
        return {'error': 'speed must be a positive number'}
    if not 0 < speed < float('inf'):
        return {'error': 'speed must be a positive number'}

    melo_lang, default_speaker = _resolve_lang(req.get('language', 'en'))

    # Re-instantiate on language switch (one model per language).
    if melo_lang != state.lang:
        from melo.api import TTS
        state.tts = TTS(language=melo_lang, device=state.device)
        state.lang = melo_lang

    # Honour an explicit voice when the loaded model knows it; otherwise
    # use the language default, then the model's first speaker.
    # NOTE(review): assumes the worker forwards the caller's `voice`
    # argument under req['voice'] — confirm against gpu_worker.
    speaker_key, spk_id = _pick_speaker(
        state.tts.hps.data.spk2id, req.get('voice'), default_speaker,
    )

    state.tts.tts_to_file(
        text=text,
        speaker_id=spk_id,
        output_path=output_path,
        speed=speed,
    )

    # Best-effort duration probe — soundfile is the lightest reader.
    try:
        import soundfile as _sf
        info = _sf.info(output_path)
        duration = round(info.frames / info.samplerate, 2)
        sr = info.samplerate
    except Exception:
        duration = 0.0
        # Prefer the loaded model's configured rate when it exposes
        # one; 24000 stays as the last-resort guess.
        sr = getattr(state.tts.hps.data, 'sampling_rate', 24000)

    return {
        'path': output_path,
        'duration': duration,
        'sample_rate': sr,
        'engine': 'melotts',
        'device': state.device,
        'language': req.get('language', 'en'),
        'voice': speaker_key,
    }
# ── Parent-side: one ToolWorker instance ─────────────────────────
#
# Module-level handle used by melotts_synthesize / unload_melotts.
# `tool_module` names this very module, whose `_load` / `_synthesize`
# callbacks the gpu_worker dispatcher imports.

_tool = ToolWorker(
    # identity
    tool_name='melotts',
    engine='melotts',
    tool_module='integrations.service_tools.melotts_tool',
    # resources
    vram_budget='tts_melotts',
    output_subdir='melotts/output',
    # timeouts (seconds)
    startup_timeout=90.0,
    request_timeout=90.0,
)
def melotts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Forward a synthesis request to the MeloTTS worker.

    Args:
        text: text to speak.
        language: language code for the request (default 'en').
        voice: optional speaker id, passed through unchanged.
        output_path: optional destination path, passed through unchanged.
        speed: speaking-rate multiplier; only sent to the worker when
            it differs from 1.0.

    Returns:
        Whatever `_tool.synthesize` returns — a JSON string compatible
        with the rest of the TTS tools. On subprocess crash the
        response contains `transient: true` so the caller (Nunba
        TTSEngine / HARTOS tts_router) can fall back.
    """
    # Keep the request minimal: the default speed is implied, so no
    # extra payload is attached for it.
    extras = None
    if speed != 1.0:
        extras = {'speed': speed}

    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request=extras,
    )
def unload_melotts():
    """Shut down the MeloTTS worker subprocess, releasing its VRAM.

    Delegates to `_tool.stop()` and returns None.
    """
    _tool.stop()
    return None
class MeloTTSTool:
    """Registers MeloTTS with the in-process service-tool registry."""

    @classmethod
    def register_functions(cls):
        """Build the MeloTTS tool descriptor and store it in the registry.

        Creates a `ServiceToolInfo` named "melotts" with a single
        "synthesize" endpoint, marks it healthy and writes it into
        `service_tool_registry._tools["melotts"]`.

        Returns:
            True, unconditionally.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        # Parameters advertised for the synthesize endpoint.
        synth_params = {
            "text": {"type": "string"},
            "language": {"type": "string"},
            "voice": {"type": "string", "description": "Speaker id (EN-US, EN-BR, ...)"},
        }
        synth_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with MeloTTS (multilingual, GPU/CPU).",
            "params_schema": synth_params,
        }
        summary = (
            "MeloTTS: myshell-ai multilingual neural TTS. "
            "6 languages (en/es/fr/zh/ja/ko), ~1.5GB VRAM, "
            "runs on CPU at real-time too. "
            "Multiple English accents (US/BR/IN/AU). "
            "No voice cloning. "
            "Requires: pip install melotts"
        )

        descriptor = ServiceToolInfo(
            name="melotts",
            description=summary,
            base_url="inprocess://melotts",
            endpoints={"synthesize": synth_endpoint},
            tags=["tts", "speech", "multilingual", "neural", "melotts"],
            timeout=60,
        )
        descriptor.is_healthy = True
        service_tool_registry._tools["melotts"] = descriptor
        return True
233# NOTE: no `if __name__ == '__main__':` block — the gpu_worker
234# dispatcher picks up `_load` / `_synthesize` when invoked via
235# `python -m integrations.service_tools.gpu_worker
236# integrations.service_tools.melotts_tool`.