Coverage for integrations / channels / media / tts_router.py: 79.1%
521 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""Smart TTS Router — selects the best TTS engine based on constraints.
3Decision factors (in priority order):
41. Language — which engines support the target language?
52. Availability — is the engine installed locally?
63. Hardware — GPU present? Enough VRAM? CPU-only fallback?
74. Compute policy — local_only | local_preferred | any (hive offload)
85. Latency — instant (espeak/browser) vs quality (neural)
96. Voice cloning — only clone-capable engines if voice requested
107. Hive peers — offload to GPU peer when local can't serve
11"""
13import json
14import logging
15import time
16from dataclasses import dataclass, field
17from enum import Enum
18from typing import Any, Dict, List, Optional, Tuple
20logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════
# Source → Urgency mapping (backend auto-infers, frontends send source)
# ═══════════════════════════════════════════════════════════════

# Maps the *origin* of a speak request to an urgency class:
#   'instant' — latency first (use the fastest engine available)
#   'normal'  — balanced latency vs. quality
#   'quality' — quality first (user explicitly asked to hear this)
SOURCE_URGENCY: Dict[str, str] = {
    'chat_response': 'normal',    # agent reply in chat
    'notification': 'instant',    # system notification
    'greeting': 'instant',        # boot/login greeting
    'read_aloud': 'quality',      # user clicked "speak this"
    'channel': 'normal',          # Discord/Telegram response
    'cli': 'quality',             # hart voice "text"
    'agent_tool': 'normal',       # agent using TTS tool
}
36# ═══════════════════════════════════════════════════════════════
37# Engine Registry — static capabilities of every TTS engine
38# ═══════════════════════════════════════════════════════════════
class TTSDevice(Enum):
    """Hardware class a TTS engine is able to run on."""
    GPU_ONLY = "gpu_only"            # requires a GPU; no CPU synthesis path
    GPU_PREFERRED = "gpu_preferred"  # works on CPU too, GPU better
    CPU_ONLY = "cpu_only"            # never uses the GPU
    CLOUD = "cloud"                  # remote service, no local compute
@dataclass(frozen=True)
class TTSEngineSpec:
    """Static specification of a TTS engine's capabilities.

    Attributes:
        engine_id: Registry key for this engine.
        device: Hardware class the engine runs on (``TTSDevice``).
        vram_key: Key in ``VRAM_BUDGETS`` (vram_manager.py); '' when the
            engine never touches the GPU.
        languages: ISO 639-1 codes, or ``('*',)`` wildcard for "all".
        quality: 0.0-1.0 subjective quality score.
        voice_clone: True when the engine can clone a reference voice.
        latency_gpu_ms: Estimated latency on GPU (0 if N/A).
        latency_cpu_ms: Estimated latency on CPU (0 if N/A).
        latency_cloud_ms: Estimated latency on cloud (0 if N/A).
        tool_module: Python module path for the tool (None for engines
            driven through a special path, e.g. cloud or bundled piper).
        tool_function: Parent-side synthesize function name.
        tool_worker_attr: ToolWorker attribute name on the tool module;
            None for CPU-only engines that have no subprocess worker.
        required_package: pip package name that must be importable at
            runtime; None for engines whose deps are bundled (e.g. piper)
            or CPU-only with no extra deps.
        pip_install_plan: Canonical pip-spec list to make
            ``required_package`` importable AND synth-functional —
            includes transitive deps the upstream package may forget to
            declare in its install_requires (e.g. chatterbox-tts ships
            ``import librosa`` in tts.py but doesn't list librosa as a
            hard dep, so a no-deps pip install leaves a broken package
            that imports far enough for find_spec() but blows up on
            actual synthesize calls — see
            ~/Documents/Nunba/logs/probe_chatterbox_turbo.err). This is
            the single source of truth for the desktop installer (Nunba)
            so it doesn't carry a parallel dict that drifts. Empty tuple
            means nothing to install (bundled / CPU stub).
        install_target: WHERE the desktop installer should land
            ``pip_install_plan``:

            * ``'main'`` — into the main python-embed site-packages
              (legacy default; risky — dep conflicts mask silent
              failures).
            * ``'venv'`` — into a private venv at
              ``~/Documents/Nunba/data/venvs/<engine>/``. Requires a
              per-engine worker file (``tts/<engine>_worker.py``) that
              the parent dispatches into via
              ``backend_venv.invoke_in_venv()``.
            * ``'bundled'`` — already on disk via the frozen build
              (piper voices, luxtts, espeak).
            * ``'cloud'`` — HTTP-only, nothing to install (makeittalk).
            * ``'git_clone'`` — needs a git clone of an upstream repo
              plus ``pip install -e`` (cosyvoice3 →
              FunAudioLLM/CosyVoice).

            The default ``'main'`` preserves current behavior; flipping
            a GPU engine to ``'venv'`` requires the matching worker file
            in Nunba (or the dispatch falls back to an in-process import,
            which only works if the engine is also installed in main).
        sample_rate: Output sample rate in Hz.
    """

    engine_id: str
    device: TTSDevice
    vram_key: str
    languages: Tuple[str, ...]
    quality: float
    voice_clone: bool
    latency_gpu_ms: int
    latency_cpu_ms: int
    latency_cloud_ms: int
    tool_module: Optional[str]
    tool_function: Optional[str]
    tool_worker_attr: Optional[str] = None
    required_package: Optional[str] = None
    pip_install_plan: Tuple[str, ...] = ()
    install_target: str = 'main'
    sample_rate: int = 24000
143# Shared pip-spec constants — keep here so the install plans below stay
144# readable and so a single edit updates every engine that pins them.
145#
146# huggingface_hub 0.29+ removes is_offline_mode that transformers <5.x
147# still imports, so we cap below 0.29 for the chatterbox / kokoro chain.
148_HF_HUB_PIN = 'huggingface_hub>=0.27.0,<0.29.0'
150# Chatterbox plan — `chatterbox-tts` on PyPI omits MULTIPLE runtime
151# imports from its install_requires. Each one only surfaces when the
152# install proceeds far enough for the next one to be reached:
153#
154# chatterbox/__init__.py:9 → from .tts import ChatterboxTTS
155# chatterbox/tts.py:4 → import librosa (missing #1)
156# chatterbox/tts.py:6 → import perth (missing #2)
157#
158# Each was discovered from a real failed install at
159# ~/Documents/Nunba/logs/probe_chatterbox_turbo.err on the user's
160# desktop — first librosa, then once that was added, perth. Listing
161# them all here means a fresh chatterbox install completes in one
162# pip pass instead of needing 2-3 self-heal iterations (each of
163# which downloads ~10 MB of pip metadata). The Nunba self-heal
164# loop catches future un-declared transitives on the install screen
165# without surfacing a synth failure to the user.
166_CHATTERBOX_PIP_PLAN: Tuple[str, ...] = (
167 _HF_HUB_PIN,
168 'torchaudio',
169 'chatterbox-tts',
170 'librosa', # missing transitive #1 — chatterbox/tts.py:4
171 'soundfile', # librosa needs it on Windows for non-WAV outputs
172 'resemble-perth', # missing transitive #2 — chatterbox/tts.py:6
173 # `import perth`; PyPI pkg name = resemble-perth
174 # (the watermark library Resemble AI uses to
175 # tag synthesized audio).
176 # NOTE on the rest of chatterbox-tts==0.1.7's requires_dist
177 # (omegaconf, conformer, pyloudnorm, pykakasi, spacy-pkuseg,
178 # diffusers, einops, s3tokenizer, etc.):
179 # We deliberately do NOT pre-install them in one pip pass.
180 # When pip is asked to install many at once with
181 # `--no-build-isolation` (frozen build constraint, see
182 # package_installer._run_pip), and one of the transitives needs
183 # a source build (omegaconf → antlr4-python3-runtime==4.9.* is
184 # sdist-only on PyPI), pip's parallel-builds path races against
185 # the bundle's setuptools and surfaces as
186 # BackendUnavailable: Cannot import 'setuptools.build_meta'
187 # (observed 2026-04-28 on the user's bundle f2d4567 — full pip
188 # invocation aborts rc=2, no transitive gets installed).
189 # _self_heal_missing_transitives in package_installer.py handles
190 # them one-at-a-time AFTER the chatterbox-tts top-level install
191 # — single-package mode never triggers the parallel-build race.
192 # Combined with the PYTHONNOUSERSITE=1 fix in tts/_torch_probe.py
193 # (probe no longer leaks system Python's site-packages), each
194 # heal cycle finds a REAL missing transitive, not a phantom one.
195 # The original 5-cycle trail (librosa → perth → einops →
196 # s3tokenizer → omegaconf) is fine because each cycle resolves
197 # in ~10-30s of single-package pip work, not 5 minutes of
198 # parallel-build resolver thrash.
199)
# All known TTS engines, keyed by engine_id.
ENGINE_REGISTRY: Dict[str, TTSEngineSpec] = {
    'chatterbox_turbo': TTSEngineSpec(
        engine_id='chatterbox_turbo',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_chatterbox_turbo',
        languages=('en',),
        quality=0.95,
        voice_clone=True,
        latency_gpu_ms=150,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.chatterbox_tool',
        tool_function='chatterbox_synthesize',
        tool_worker_attr='_turbo',
        required_package='chatterbox',
        pip_install_plan=_CHATTERBOX_PIP_PLAN,
        # chatterbox-tts 0.1.7 hard-pins torch==2.6.0, transformers==5.2.0,
        # numpy<2.0.0, diffusers==0.29.0, safetensors==0.5.3 — all in
        # direct conflict with HARTOS's main interpreter (torch 2.11,
        # transformers 5.1, numpy 2.4, diffusers 0.37, safetensors 0.7).
        # Auto-heal can never satisfy these because main-interpreter
        # downgrades would break llama-server, indic_parler, faster-whisper,
        # and every other ML stack. Quarantine into its own venv —
        # same pattern indic_parler uses (parler-tts pinned
        # transformers<4.47). Nunba's tts/package_installer.py routes
        # the install into ~/.nunba/venvs/chatterbox_turbo/, and the
        # HARTOS ToolWorker's python_exe is set to the venv's python
        # at runtime via desktop/_wire_venv_engines.py at boot, so the
        # synth subprocess sees the pinned chatterbox-compatible deps
        # instead of the main interpreter's incompatible newer ones.
        install_target='venv',
    ),
    # luxtts REMOVED — poor audio quality, not suitable for any use case.
    'cosyvoice3': TTSEngineSpec(
        engine_id='cosyvoice3',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_cosyvoice3',
        languages=('zh', 'ja', 'ko', 'de', 'es', 'fr', 'it', 'ru', 'en'),
        quality=0.92,
        voice_clone=True,
        latency_gpu_ms=200,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.cosyvoice_tool',
        tool_function='cosyvoice_synthesize',
        tool_worker_attr='_tool',
        required_package='cosyvoice',
        # cosyvoice is NOT pip-installable — needs a `git clone` of
        # FunAudioLLM/CosyVoice plus model weight download via
        # huggingface_hub. Empty plan + install_target='git_clone'
        # signals Nunba to skip the pip path entirely and route
        # through its git-clone install handler instead. The
        # verify-synth probe must also short-circuit on git_clone
        # engines when the package isn't importable (current Nunba
        # bug: probe runs `import cosyvoice` blindly + always fails).
        pip_install_plan=(),
        install_target='git_clone',
    ),
    'f5_tts': TTSEngineSpec(
        engine_id='f5_tts',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_f5',
        languages=('en', 'zh'),
        quality=0.91,
        voice_clone=True,
        latency_gpu_ms=200,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.f5_tts_tool',
        tool_function='f5_synthesize',
        tool_worker_attr='_tool',
        required_package='f5_tts',
        pip_install_plan=('torchaudio', 'f5-tts'),
    ),
    'indic_parler': TTSEngineSpec(
        engine_id='indic_parler',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_indic_parler',
        languages=(
            'hi', 'ta', 'te', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ur',
            'as', 'bho', 'doi', 'kok', 'mai', 'mni', 'ne', 'sa', 'sat', 'sd', 'en',
        ),
        quality=0.90,
        voice_clone=False,
        latency_gpu_ms=300,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.indic_parler_tool',
        tool_function='indic_parler_synthesize',
        tool_worker_attr='_tool',
        required_package='parler_tts',
        # Indic Parler quarantines into its own venv on the desktop —
        # parler-tts 0.2.2 hard-pins transformers<4.47 which conflicts
        # with the main interpreter's transformers 5.1.0. The full
        # pip plan lives here so it travels with the engine spec; the
        # desktop installer routes the install into the venv when
        # install_target='venv'. Worker file:
        # tts/indic_parler_worker.py (Nunba). HARTOS server side runs
        # Indic Parler in its own subprocess worker so the main
        # interpreter pin doesn't apply there either.
        pip_install_plan=(
            # tqdm + colorama pinned FIRST to stop pip's resolver from
            # backtracking through colorama 0.1.x (no setup.py, breaks
            # install). Witnessed user-facing failure:
            # "Indic Parler TTS unavailable — using fallback voice engine"
            # Root-caused from ~/Documents/Nunba/logs/venv_indic_parler.log.
            'colorama>=0.4.6',
            'tqdm>=4.65',
            'transformers==4.46.1',  # parler-tts 0.2.2 requires <4.47
            'torch',  # CPU-ish fallback; replaced by CUDA if GPU
            'torchaudio',
            'sentencepiece',
            'descript-audio-codec',
            'parler-tts==0.2.2',  # 0.2.3 has DacModel.decode() API mismatch
            'soundfile',
            _HF_HUB_PIN,
        ),
        install_target='venv',
    ),
    'chatterbox_ml': TTSEngineSpec(
        engine_id='chatterbox_ml',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_chatterbox_ml',
        languages=(
            'en', 'zh', 'ja', 'ko', 'de', 'es', 'fr', 'it', 'ru', 'pt',
            'ar', 'nl', 'pl', 'sv', 'tr', 'hi', 'ta', 'te', 'bn', 'id',
            'th', 'vi', 'cs',
        ),
        quality=0.94,
        voice_clone=True,
        latency_gpu_ms=300,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.chatterbox_tool',
        tool_function='chatterbox_ml_synthesize',
        tool_worker_attr='_ml',
        required_package='chatterbox',
        pip_install_plan=_CHATTERBOX_PIP_PLAN,
    ),
    'pocket_tts': TTSEngineSpec(
        engine_id='pocket_tts',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('en',),
        quality=0.85,
        voice_clone=True,
        latency_gpu_ms=0,
        latency_cpu_ms=200,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.pocket_tts_tool',
        tool_function='pocket_tts_synthesize',
        pip_install_plan=('pocket-tts',),
    ),
    # NeuTTS Air — Neuphonic 748M-param Qwen2 backbone with NeuCodec
    # decoder, Apache 2.0. GGUF Q4 (~600MB) / Q8 (~800MB). RTF<0.5
    # on CPU (Intel i5 / RPi 5), 24kHz output, instant voice cloning
    # from 3-15s reference audio. English primary. Slots between
    # omnivoice and kokoro on the English ladder per quality
    # (kokoro=0.88, neutts=0.91, omnivoice~0.93, chatterbox=0.95).
    #
    # Reference voice contract: NeuTTS requires a reference audio +
    # transcript per call (no built-in 'alba'-style zero-config
    # voices). The wrapper resolves 'jo' (upstream sample shipped
    # with the package), any path to a .wav with companion .txt,
    # or a custom name from ~/.hevolve/models/tts/neutts/voices/.
    # See integrations/service_tools/neutts_tool.py for resolution.
    'neutts_air': TTSEngineSpec(
        engine_id='neutts_air',
        device=TTSDevice.GPU_PREFERRED,
        vram_key='tts_neutts',
        languages=('en',),
        quality=0.91,
        voice_clone=True,
        latency_gpu_ms=150,
        latency_cpu_ms=400,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.neutts_tool',
        tool_function='neutts_synthesize',
        # Worker attribute — Nunba's `_SubprocessTTSBackend` needs this
        # to drive the subprocess. Without it the spec falls into the
        # `_InProcessTTSBackend` path (line ~2408 of Nunba's
        # tts_engine.py) which does `import neutts` from the MAIN
        # interpreter — a guaranteed ImportError because
        # install_target='venv' lands the package in the per-engine
        # venv, not the main python-embed. Setting `_tool` here pairs
        # with the ToolWorker singleton in
        # integrations.service_tools.neutts_tool, mirroring
        # kokoro / chatterbox / f5 / indic_parler.
        tool_worker_attr='_tool',
        required_package='neutts',
        # `neutts[all]` pulls llama-cpp-python (for GGUF inference)
        # plus soundfile + onnxruntime. The base `neutts` package
        # alone is not enough for synth — the codec decoder needs
        # onnxruntime. Pin huggingface_hub via _HF_HUB_PIN so the
        # transformers chain stays consistent with the rest of the
        # English ladder (chatterbox / kokoro use the same pin).
        pip_install_plan=(
            _HF_HUB_PIN,
            'neutts[all]',
            'soundfile',  # explicit — wrapper requires soundfile.write
        ),
        # Quarantine into its own venv on the desktop installer.
        # NeuTTS pulls llama-cpp-python which can drift from the
        # main interpreter's torch / numpy stack. Same pattern as
        # chatterbox_turbo and indic_parler.
        install_target='venv',
    ),
    # Kokoro 82M — tiny neural English TTS. Runs on CPU (≈1× real-time,
    # 200MB RAM) or GPU (≈0.1× real-time, 200MB VRAM). Quality sits
    # above Piper and below the big voice-clone engines, so it's the
    # right second rung on the English ladder — tried when the GPU
    # engines can't run (no CUDA, VRAM full, package missing) but
    # BEFORE we fall all the way down to Piper.
    #
    # Benchmark context (vs piper, on English):
    #   - quality:     kokoro 0.88  vs piper 0.70  (subjective MOS gap)
    #   - cpu latency: kokoro 400ms vs piper 200ms (per ~10 words)
    #   - disk:        kokoro 160MB vs piper 60MB  (per voice)
    #   - voices:      kokoro ~25   vs piper ~15   (per-language catalog)
    # Piper still wins on raw CPU speed and disk, which is why it
    # stays the absolute last-resort fallback.
    'kokoro': TTSEngineSpec(
        engine_id='kokoro',
        device=TTSDevice.GPU_PREFERRED,
        vram_key='tts_kokoro',
        languages=('en',),
        quality=0.88,
        voice_clone=False,
        latency_gpu_ms=120,
        latency_cpu_ms=400,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.kokoro_tool',
        tool_function='kokoro_synthesize',
        tool_worker_attr='_tool',
        required_package='kokoro',
        pip_install_plan=(
            _HF_HUB_PIN,
            'kokoro',    # pulls misaki phonemizer transitively
            'espeakng',  # espeak-ng Python bindings (ships binary on Windows)
        ),
    ),
    # OmniVoice — universal TTS. Qwen3-0.6B backbone + diffusion head,
    # 646 languages (581k training hours spanning every Indic script,
    # zh/ja/ko, European, Arabic, low-resource). Zero-shot voice cloning
    # from 3-10 s of reference audio. Apache 2.0.
    #
    # Languages tuple is ('*',) — same wildcard convention as espeak —
    # but select_engines() only considers engines explicitly listed in
    # LANG_ENGINE_PREFERENCE for the resolved language. We prepend
    # 'omnivoice' to every Indic + non-English entry + _DEFAULT_PREFERENCE
    # so it wins unless it's uninstalled or the GPU can't hold it.
    #
    # VRAM is stubbed at 3.0 GB in vram_manager.VRAM_BUDGETS; the worker
    # self-reports actual usage on first load via '__WORKER_VRAM_GB__'
    # and vram_manager.record_actual_usage tightens the budget.
    'omnivoice': TTSEngineSpec(
        engine_id='omnivoice',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_omnivoice',
        languages=('*',),  # 646 languages
        quality=0.93,
        voice_clone=True,
        latency_gpu_ms=250,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.omnivoice_tool',
        tool_function='omnivoice_synthesize',
        tool_worker_attr='_tool',
        required_package='omnivoice',
        # See omnivoice_tool.py docstring: "Requires: pip install
        # omnivoice torch soundfile". torch is bundled.
        pip_install_plan=('omnivoice', 'soundfile'),
    ),
    'espeak': TTSEngineSpec(
        engine_id='espeak',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('*',),  # 100+ languages
        quality=0.40,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=10,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.pocket_tts_tool',
        tool_function='pocket_tts_synthesize',  # espeak is fallback inside pocket
        install_target='bundled',
    ),
    'makeittalk': TTSEngineSpec(
        engine_id='makeittalk',
        device=TTSDevice.CLOUD,
        vram_key='',
        languages=('en',),
        quality=0.88,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=0,
        latency_cloud_ms=5000,
        tool_module=None,  # Special cloud path in model_bus_service
        tool_function=None,
        install_target='cloud',
    ),
    # Piper — bundled CPU engine, multilingual via downloadable voice
    # files. Uses ('*',) wildcard (same convention as espeak) so one
    # spec covers every language Piper has voices for — no parallel
    # per-language list. Runtime synth attempt raises on missing voice
    # files and the router falls through to a neural engine.
    'piper': TTSEngineSpec(
        engine_id='piper',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('*',),
        quality=0.70,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=200,
        latency_cloud_ms=0,
        tool_module=None,  # In-process via Nunba tts/piper_tts.py —
                           # no subprocess worker, no required_package.
        tool_function=None,
        install_target='bundled',
    ),
    # ── Mid-VRAM coverage tier (1–3 GB) ───────────────────────────
    # These three engines fill the gap so every SUPPORTED_LANG_DICT
    # code has at least one engine with vram_gb≤3.0 in its preference
    # ladder. Indic Parler (2.0) + F5 (2.5) cover en/zh + 22 Indic;
    # the trio below adds the rest of the major language families
    # without forcing users onto the 12-14 GB Chatterbox-ML or the
    # uninstallable git-clone CosyVoice path.
    'melotts': TTSEngineSpec(
        engine_id='melotts',
        device=TTSDevice.GPU_PREFERRED,  # works on CPU at real-time too
        vram_key='tts_melotts',
        languages=('en', 'es', 'fr', 'zh', 'ja', 'ko'),
        quality=0.86,
        voice_clone=False,
        latency_gpu_ms=180,
        latency_cpu_ms=600,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.melotts_tool',
        tool_function='melotts_synthesize',
        tool_worker_attr='_tool',
        required_package='melo',  # `from melo.api import TTS`
        pip_install_plan=(
            _HF_HUB_PIN,
            'melotts',    # PyPI package; ships `melo` import root
            'soundfile',  # used for duration probe
        ),
    ),
    'xtts_v2': TTSEngineSpec(
        engine_id='xtts_v2',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_xtts_v2',
        languages=(
            'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
            'cs', 'ar', 'zh', 'hu', 'ko', 'ja', 'hi',
        ),
        quality=0.92,
        voice_clone=True,
        latency_gpu_ms=350,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.xtts_tool',
        tool_function='xtts_synthesize',
        tool_worker_attr='_tool',
        required_package='TTS',  # `from TTS.api import TTS`
        pip_install_plan=(
            _HF_HUB_PIN,
            'coqui-tts',  # idiap-maintained 2026 fork on PyPI;
                          # ships `from TTS.api import TTS` so
                          # the import path is stable.
            'soundfile',
        ),
    ),
    'mms_tts': TTSEngineSpec(
        engine_id='mms_tts',
        device=TTSDevice.GPU_PREFERRED,  # CPU works, GPU faster
        vram_key='tts_mms_tts',
        languages=(
            # Roman-script languages where mms_tts_tool routes without
            # uroman. Non-Roman scripts (ar/hi/zh/ko/ja/...) ALSO have
            # mms-tts checkpoints but require uroman pre-processing —
            # the tool gracefully fails when uroman isn't installed and
            # the router falls through to the next preference. We list
            # the broader set here because the tool decides per-call
            # whether it can serve; the router's job is to attempt.
            'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
            'cs', 'hu', 'sv', 'fi', 'el', 'ro', 'bg', 'uk', 'cy', 'is',
            'zh', 'ja', 'ko', 'vi', 'th', 'id', 'ms', 'km', 'lo', 'my',
            'hi', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or',
            'ne', 'as', 'sd', 'sa', 'ur', 'si',
            'ar', 'fa', 'he', 'sw',
        ),
        quality=0.78,
        voice_clone=False,
        latency_gpu_ms=200,
        latency_cpu_ms=500,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.mms_tts_tool',
        tool_function='mms_tts_synthesize',
        tool_worker_attr='_tool',
        required_package='transformers',  # already bundled — no install plan
        pip_install_plan=(
            _HF_HUB_PIN,
            'soundfile',  # for WAV write
            # uroman is OPTIONAL — only needed for non-Roman scripts.
            # The tool falls through cleanly when missing, so we don't
            # bundle the perl repo + extra pip dep into every install.
            # Users who want broad Indic/Arabic/CJK coverage from MMS
            # specifically can `pip install uroman` separately.
        ),
    ),
}
# ═══════════════════════════════════════════════════════════════
# Language → Engine Preference Table
# ═══════════════════════════════════════════════════════════════

# Ordered by quality for each language — first available wins.
#
# Many languages share the exact same engine order. The tuples below
# name each shared ladder once; every dict entry takes an independent
# list(...) copy so mutating one language's ladder can never leak into
# another's. Rationale for the ordering:
#   * omnivoice leads every non-English entry — widest training
#     coverage, voice cloning, and cross-language engine consistency.
#   * indic_parler stays as fallback for Indic for one release cycle.
#   * xtts_v2 (2.5 GB) and melotts (1.5 GB) slot above the 12-14 GB
#     chatterbox_ml so 4-8 GB GPUs get quality TTS.
#   * mms_tts (1 GB tier) is the always-runnable neural floor.
#   * espeak is the absolute last-resort phoneme synth everywhere.
_LADDER_INDIC = ('omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak')
_LADDER_INDIC_LITE = ('omnivoice', 'indic_parler', 'mms_tts', 'espeak')
_LADDER_EURO_XTTS = ('omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak')
_LADDER_HEAVY = ('omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak')
_LADDER_MMS = ('omnivoice', 'mms_tts', 'espeak')
_LADDER_ZH = ('omnivoice', 'melotts', 'cosyvoice3', 'f5_tts', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak')

LANG_ENGINE_PREFERENCE: Dict[str, List[str]] = {
    # English ladder (quality-then-speed): chatterbox_turbo wins on
    # quality (0.95); omnivoice (~0.93) next for cross-language engine
    # consistency; neutts_air (0.91) sits above kokoro (0.88);
    # pocket_tts is the small cloneable fallback; piper/espeak are the
    # bundled CPU floors that always ship.
    'en': ['chatterbox_turbo', 'omnivoice', 'neutts_air', 'melotts', 'xtts_v2', 'kokoro', 'pocket_tts', 'cosyvoice3', 'mms_tts', 'piper', 'espeak'],
    # Indic — omnivoice replaces indic_parler as primary (far more
    # training hours per major Indic language, plus voice cloning which
    # parler lacks). xtts_v2 adds Hindi only.
    'hi': ['omnivoice', 'indic_parler', 'xtts_v2', 'chatterbox_ml', 'cosyvoice3', 'mms_tts', 'espeak'],
    'ta': list(_LADDER_INDIC),
    'te': list(_LADDER_INDIC),
    'bn': list(_LADDER_INDIC),
    'gu': list(_LADDER_INDIC),
    'kn': list(_LADDER_INDIC),
    'ml': list(_LADDER_INDIC),
    'mr': list(_LADDER_INDIC),
    'or': list(_LADDER_INDIC),
    'pa': list(_LADDER_INDIC),
    'ur': list(_LADDER_INDIC),
    'as': list(_LADDER_INDIC),
    'ne': list(_LADDER_INDIC),
    'sa': list(_LADDER_INDIC),
    'si': list(_LADDER_HEAVY),   # Sinhala — no indic_parler voice; mms adds 1 GB-tier
    'sd': list(_LADDER_INDIC),   # Sindhi — Indic Parler + mms
    # CJK — omnivoice has 500k+ hours of CJK training; melotts slots
    # above the heavy chatterbox_ml for the 1.5 GB tier.
    'zh': list(_LADDER_ZH),
    'ja': ['omnivoice', 'melotts', 'cosyvoice3', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ko': ['omnivoice', 'melotts', 'cosyvoice3', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    # European — xtts_v2 (voice clone) and melotts sit above the 12 GB
    # chatterbox_ml so it never pushes other workers off a small GPU.
    'de': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'es': ['omnivoice', 'melotts', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'fr': ['omnivoice', 'melotts', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'it': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ru': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'pt': list(_LADDER_EURO_XTTS),
    'ar': list(_LADDER_EURO_XTTS),
    'nl': list(_LADDER_EURO_XTTS),
    'pl': list(_LADDER_EURO_XTTS),
    'sv': list(_LADDER_HEAVY),
    'tr': list(_LADDER_EURO_XTTS),
    'id': list(_LADDER_HEAVY),
    'th': list(_LADDER_HEAVY),
    'vi': list(_LADDER_HEAVY),
    'cs': list(_LADDER_EURO_XTTS),
    # Previously routed via _DEFAULT_PREFERENCE only (where
    # chatterbox_ml needs 14 GB); mms_tts at 1 GB is the runnable floor.
    'hu': list(_LADDER_EURO_XTTS),
    'el': ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak'],
    'fi': list(_LADDER_MMS),
    'ro': list(_LADDER_MMS),
    'bg': list(_LADDER_MMS),
    'uk': list(_LADDER_MMS),
    'cy': list(_LADDER_MMS),     # Welsh
    'is': list(_LADDER_MMS),     # Icelandic
    'ms': list(_LADDER_MMS),     # Malay
    'fa': list(_LADDER_MMS),     # Persian (uroman)
    'he': list(_LADDER_MMS),     # Hebrew (uroman)
    'sw': list(_LADDER_MMS),     # Swahili
    'km': list(_LADDER_MMS),     # Khmer (uroman)
    'lo': list(_LADDER_MMS),     # Lao (uroman)
    'my': list(_LADDER_MMS),     # Burmese (uroman)
    # Additional Indic codes from SUPPORTED_LANG_DICT riding Indic
    # Parler's 22-language coverage, then mms_tts.
    'brx': list(_LADDER_INDIC_LITE),  # Bodo
    'doi': list(_LADDER_INDIC_LITE),  # Dogri
    'kok': list(_LADDER_INDIC_LITE),  # Konkani
    'mai': list(_LADDER_INDIC_LITE),  # Maithili
    'mni': list(_LADDER_INDIC_LITE),  # Manipuri
    'sat': list(_LADDER_INDIC_LITE),  # Santali
    'ks': list(_LADDER_MMS),          # Kashmiri
    'lv': list(_LADDER_MMS),          # Latvian
    'sr': ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak'],  # Serbian
    'zh-cn': list(_LADDER_ZH),
}
# Fallback for unlisted languages — omnivoice covers 646 + mms_tts covers
# 1100+, so this is reached only when both are uninstalled / can't fit.
# chatterbox_ml is the heaviest local clone, espeak is the absolute floor.
# Used by TTSRouter.select_engines() when neither the full language code
# nor its 2-char base appears in LANG_ENGINE_PREFERENCE.
_DEFAULT_PREFERENCE: List[str] = ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak']
# ═══════════════════════════════════════════════════════════════
# Route result
# ═══════════════════════════════════════════════════════════════

class TTSLocation(Enum):
    """Where a synthesis request will execute.

    LOCAL     — this host (CPU or GPU)
    HIVE_PEER — offloaded to a GPU peer on the compute mesh
    CLOUD     — remote HTTP API (MakeItTalk)
    """
    LOCAL = "local"
    HIVE_PEER = "hive_peer"
    CLOUD = "cloud"
@dataclass
class TTSCandidate:
    """A scored TTS engine candidate produced by TTSRouter.select_engines()."""
    engine: TTSEngineSpec               # static capabilities of the engine
    location: TTSLocation               # where synthesis would run
    device: str  # 'gpu', 'cpu', 'cloud'
    estimated_latency_ms: int           # engine latency (+ network hop for hive peers)
    quality_score: float                # spec.quality, possibly penalized (network / CPU)
    peer_address: Optional[str] = None  # if location == HIVE_PEER
    warnings: List[str] = field(default_factory=list)  # propagated into TTSResult.warnings
@dataclass
class TTSResult:
    """Result of a TTS synthesis."""
    path: str
    duration: float
    engine_id: str
    device: str
    location: str
    latency_ms: float
    sample_rate: int
    voice: str
    quality_score: float
    warnings: List[str] = field(default_factory=list)
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a wire-friendly dict.

        'warnings' and 'error' are included only when set, so successful
        results stay compact.
        """
        payload: Dict[str, Any] = dict((
            ('path', self.path),
            ('duration', self.duration),
            ('engine', self.engine_id),
            ('device', self.device),
            ('location', self.location),
            ('latency_ms', self.latency_ms),
            ('sample_rate', self.sample_rate),
            ('voice', self.voice),
            ('quality_score', self.quality_score),
        ))
        for key, value in (('warnings', self.warnings), ('error', self.error)):
            if value:
                payload[key] = value
        return payload
786# ═══════════════════════════════════════════════════════════════
787# Language Detection
788# ═══════════════════════════════════════════════════════════════
def detect_language(text: str) -> str:
    """Detect language of text. Returns ISO 639-1 code (e.g. 'en', 'hi').

    Uses langdetect if available, falls back to a Unicode-script counting
    heuristic. Empty / whitespace-only input defaults to 'en'.
    """
    if not text or not text.strip():
        return 'en'
    try:
        from langdetect import detect
        return detect(text)
    except Exception:
        # langdetect missing, or it failed on this input — fall through
        # to the script-range heuristic below.
        pass

    # Heuristic fallback: count characters per Unicode script block and
    # return the first script (in priority order) whose count exceeds the
    # threshold. Order matters — Indic scripts are checked before CJK,
    # and Japanese combines katakana + hiragana into one count.
    sample = text[:500]
    script_ranges = [
        ('hi', (('\u0900', '\u097F'),)),                      # Devanagari
        ('ta', (('\u0B80', '\u0BFF'),)),                      # Tamil
        ('te', (('\u0C00', '\u0C7F'),)),                      # Telugu
        ('bn', (('\u0980', '\u09FF'),)),                      # Bengali
        ('gu', (('\u0A80', '\u0AFF'),)),                      # Gujarati
        ('kn', (('\u0C80', '\u0CFF'),)),                      # Kannada
        ('ml', (('\u0D00', '\u0D7F'),)),                      # Malayalam
        ('zh', (('\u4E00', '\u9FFF'),)),                      # CJK unified ideographs
        ('ko', (('\uAC00', '\uD7AF'),)),                      # Hangul syllables
        ('ja', (('\u30A0', '\u30FF'), ('\u3040', '\u309F'))), # Katakana + Hiragana
        ('ar', (('\u0600', '\u06FF'),)),                      # Arabic
        ('ru', (('\u0400', '\u04FF'),)),                      # Cyrillic
    ]
    # At least 3 chars, or 10% of the sample, must belong to the script.
    threshold = max(3, len(sample) * 0.1)
    for lang, ranges in script_ranges:
        count = sum(1 for c in sample for lo, hi in ranges if lo <= c <= hi)
        if count > threshold:
            return lang
    return 'en'
849# ═══════════════════════════════════════════════════════════════
850# Engine Availability Detection
851# ═══════════════════════════════════════════════════════════════
# Cache for engine availability (avoid repeated import checks).
# Maps engine_id -> (available, checked_at_unix_seconds); an entry is
# considered fresh for _CACHE_TTL seconds after checked_at.
_engine_available_cache: Dict[str, Tuple[bool, float]] = {}
_CACHE_TTL = 60.0  # seconds
# Engines whose availability is proven by importing one service-tool symbol:
# engine_id -> (module, symbol). Engines needing a bespoke probe (espeak
# binary on PATH, find_spec for renamed import roots, env-var for cloud)
# are handled explicitly in _is_engine_installed.
_TOOL_IMPORT_CHECKS: Dict[str, Tuple[str, str]] = {
    'pocket_tts': ('integrations.service_tools.pocket_tts_tool', 'pocket_tts_synthesize'),
    'luxtts': ('integrations.service_tools.luxtts_tool', 'luxtts_synthesize'),
    'cosyvoice3': ('integrations.service_tools.cosyvoice_tool', 'cosyvoice_synthesize'),
    'indic_parler': ('integrations.service_tools.indic_parler_tool', 'indic_parler_synthesize'),
    'chatterbox_turbo': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
    'chatterbox_ml': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
    'f5_tts': ('integrations.service_tools.f5_tts_tool', 'f5_synthesize'),
    # transformers is bundled; check the VitsModel symbol so we detect
    # outright-broken transformers installs early.
    'mms_tts': ('transformers', 'VitsModel'),
    'kokoro': ('integrations.service_tools.kokoro_tool', 'kokoro_synthesize'),
}


def _is_engine_installed(engine_id: str) -> bool:
    """Check if a TTS engine's Python package is available (cached 60 s).

    TODO REFACTOR: move to model_catalog as ModelEntry.is_installed() —
    a model that isn't pip-importable shouldn't be selectable by any caller.
    """
    now = time.time()
    cached = _engine_available_cache.get(engine_id)
    if cached and (now - cached[1]) < _CACHE_TTL:
        return cached[0]

    # Unknown engine or no tool module registered — never available.
    spec = ENGINE_REGISTRY.get(engine_id)
    if not spec or not spec.tool_module:
        _engine_available_cache[engine_id] = (False, now)
        return False

    available = False
    try:
        if engine_id == 'espeak':
            # espeak availability checked via the binary on PATH
            import shutil
            available = shutil.which('espeak-ng') is not None or shutil.which('espeak') is not None
        elif engine_id == 'melotts':
            # `melotts` PyPI package ships the `melo` import root.
            import importlib.util as _ils
            available = _ils.find_spec('melo') is not None
        elif engine_id == 'xtts_v2':
            # `coqui-tts` PyPI package ships `from TTS.api import TTS`.
            import importlib.util as _ils
            available = _ils.find_spec('TTS') is not None
        elif engine_id == 'makeittalk':
            # Cloud engine — "installed" means the API endpoint is configured.
            import os
            available = bool(os.environ.get('MAKEITTALK_API_URL'))
        elif engine_id in _TOOL_IMPORT_CHECKS:
            import importlib
            module_name, symbol = _TOOL_IMPORT_CHECKS[engine_id]
            available = hasattr(importlib.import_module(module_name), symbol)
    except Exception:
        # Any import-time failure (missing package, broken install) means
        # the engine can't serve requests right now.
        available = False

    _engine_available_cache[engine_id] = (available, now)
    return available
924def _get_gpu_info() -> Dict[str, Any]:
925 """Get GPU info from VRAMManager (cached singleton)."""
926 try:
927 from integrations.service_tools.vram_manager import get_vram_manager
928 mgr = get_vram_manager()
929 return mgr.detect_gpu()
930 except (ImportError, Exception):
931 return {'cuda_available': False, 'total_gb': 0, 'free_gb': 0}
def _can_fit_on_gpu(engine_id: str) -> bool:  # TODO REFACTOR: remove — duplicates catalog.matches_compute()
    """Check if this engine's model fits in available VRAM.

    Returns False for unknown engines, engines without a vram_key, or
    whenever the VRAM manager is unavailable or raises.
    """
    spec = ENGINE_REGISTRY.get(engine_id)
    if not spec or not spec.vram_key:
        return False
    try:
        from integrations.service_tools.vram_manager import get_vram_manager
        return get_vram_manager().can_fit(spec.vram_key)
    except Exception:
        return False
946def _get_compute_policy() -> Dict[str, Any]:
947 """Get user's compute policy (local_only / local_preferred / any)."""
948 try:
949 from integrations.agent_engine.compute_config import get_compute_policy
950 return get_compute_policy()
951 except (ImportError, Exception):
952 return {'compute_policy': 'local_preferred'}
955# ═══════════════════════════════════════════════════════════════
956# Hive Peer TTS Offload
957# ═══════════════════════════════════════════════════════════════
959def _find_hive_peer_for_tts(language: str) -> Optional[Dict[str, Any]]:
960 # TODO REFACTOR: move to orchestrator as find_peer_for(model_type, language) —
961 # hive peer offloading applies to all model types (STT, VLM, LLM), not just TTS.
962 """Find a hive peer with GPU that can serve TTS for this language.
964 Returns peer info dict or None.
965 """
966 try:
967 from integrations.agent_engine.compute_mesh_service import get_compute_mesh
968 mesh = get_compute_mesh()
969 if not mesh or not mesh.peers:
970 return None
972 for peer in mesh.peers.values():
973 if not peer.available_compute or peer.available_compute < 0.1:
974 continue
975 # Peer has GPU and capacity
976 caps = peer.capabilities or {}
977 if caps.get('gpu'):
978 return {
979 'peer_id': peer.peer_id,
980 'address': peer.address,
981 'latency_ms': peer.latency_ms or 500,
982 'gpu': caps.get('gpu', 'unknown'),
983 }
984 return None
985 except (ImportError, Exception):
986 return None
def _offload_tts_to_peer(peer: Dict, text: str, language: str,
                         voice: Optional[str] = None) -> Optional[Dict]:
    """Offload TTS synthesis to a hive peer via compute mesh (DRY — reuses mesh service).

    NOTE(review): `peer` is currently unused — offload_to_best_peer picks
    its own target. The parameter is kept for interface stability; confirm
    whether it should be wired through to the mesh call.

    Returns the peer's result dict on success, None on any failure.
    """
    try:
        from integrations.agent_engine.compute_mesh_service import get_compute_mesh
        mesh = get_compute_mesh()
        if not mesh:
            return None
        result = mesh.offload_to_best_peer(
            model_type='tts',
            prompt=text,
            options={'language': language, 'voice': voice or 'default'},
        )
        if result and 'error' not in result:
            return result
    except Exception as e:
        logger.debug("Hive TTS offload failed: %s", e)
    return None
1009# ═══════════════════════════════════════════════════════════════
1010# TTSRouter — the brain
1011# ═══════════════════════════════════════════════════════════════
class TTSRouter:
    """Smart TTS engine selector and dispatcher.

    Considers language, hardware, compute policy, latency, and hive peers
    to select the best engine for each synthesis request.
    """

    # Neural service-tool engines that share the uniform
    # fn(text, language=..., voice=..., output_path=...) call shape:
    # engine_id -> (module_path, function_name). Dispatched through
    # _call_gpu_engine; engines with bespoke call shapes (luxtts,
    # pocket_tts, espeak, cloud) keep dedicated _call_* methods.
    _GPU_ENGINE_TOOLS: Dict[str, Tuple[str, str]] = {
        'cosyvoice3': ('integrations.service_tools.cosyvoice_tool', 'cosyvoice_synthesize'),
        'indic_parler': ('integrations.service_tools.indic_parler_tool', 'indic_parler_synthesize'),
        'chatterbox_turbo': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
        'chatterbox_ml': ('integrations.service_tools.chatterbox_tool', 'chatterbox_ml_synthesize'),
        'f5_tts': ('integrations.service_tools.f5_tts_tool', 'f5_synthesize'),
        'kokoro': ('integrations.service_tools.kokoro_tool', 'kokoro_synthesize'),
        'melotts': ('integrations.service_tools.melotts_tool', 'melotts_synthesize'),
        'xtts_v2': ('integrations.service_tools.xtts_tool', 'xtts_synthesize'),
        'mms_tts': ('integrations.service_tools.mms_tts_tool', 'mms_tts_synthesize'),
    }

    def select_engines(  # TODO REFACTOR: remove — catalog.select_best() is the single selector.
        # Language preferences feed into catalog via populate_tts_catalog()'s language_priority.
        # Move _is_engine_installed() to catalog, _find_hive_peer to orchestrator.
        self,
        text: str,
        language: Optional[str] = None,
        voice: Optional[str] = None,
        urgency: str = 'normal',
        require_clone: bool = False,
    ) -> List[TTSCandidate]:
        """Select and rank TTS engines for the given request.

        Args:
            text: Text to synthesize
            language: ISO 639-1 code (auto-detected if None)
            voice: Voice reference (triggers clone-capable filter)
            urgency: 'instant' (fastest), 'normal', 'quality' (best quality)
            require_clone: Only return engines with voice cloning

        Returns:
            Ranked list of TTSCandidate (best first), never empty
        """
        # Step 1: Detect language. Lowercase but do NOT truncate yet —
        # the old `lang[:2]` normalization made every 3+-char key in
        # LANG_ENGINE_PREFERENCE ('zh-cn', 'brx', 'doi', 'kok', 'mai',
        # 'mni', 'sat', ...) unreachable.
        lang = (language or detect_language(text)).lower()

        # Step 2: Get preferred engines for this language — exact code
        # first, then the 2-char base code ('pt-br' -> 'pt'), then the
        # universal fallback chain.
        preferred = LANG_ENGINE_PREFERENCE.get(lang)
        if preferred is None:
            base = lang.split('-')[0][:2]
            preferred = LANG_ENGINE_PREFERENCE.get(base, _DEFAULT_PREFERENCE)

        # Step 3: Gather hardware + policy constraints
        gpu_info = _get_gpu_info()
        has_gpu = gpu_info.get('cuda_available', False)
        policy = _get_compute_policy()
        compute_mode = policy.get('compute_policy', 'local_preferred')

        # Step 4: Score each candidate (dedupe while preserving the
        # language table's preference order)
        candidates: List[TTSCandidate] = []
        seen = set()

        for engine_id in preferred:
            if engine_id in seen:
                continue
            seen.add(engine_id)

            spec = ENGINE_REGISTRY.get(engine_id)
            if not spec:
                continue

            # Voice cloning filter
            if require_clone and not spec.voice_clone:
                continue

            if spec.device == TTSDevice.CLOUD:
                # Cloud engines: skip if local_only
                if compute_mode == 'local_only':
                    continue
                if _is_engine_installed(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.CLOUD,
                        device='cloud',
                        estimated_latency_ms=spec.latency_cloud_ms,
                        quality_score=spec.quality,
                    ))
                continue

            if spec.device == TTSDevice.GPU_ONLY:
                if has_gpu and _can_fit_on_gpu(engine_id):
                    if _is_engine_installed(engine_id):
                        candidates.append(TTSCandidate(
                            engine=spec,
                            location=TTSLocation.LOCAL,
                            device='gpu',
                            estimated_latency_ms=spec.latency_gpu_ms,
                            quality_score=spec.quality,
                        ))
                    continue

                # GPU engine not runnable locally — try hive peer
                if compute_mode != 'local_only':
                    peer = _find_hive_peer_for_tts(lang)
                    if peer:
                        candidates.append(TTSCandidate(
                            engine=spec,
                            location=TTSLocation.HIVE_PEER,
                            device='gpu',
                            estimated_latency_ms=spec.latency_gpu_ms + peer['latency_ms'],
                            quality_score=spec.quality * 0.95,  # slight penalty for network
                            peer_address=peer['address'],
                            warnings=[f"Offloaded to hive peer {peer['peer_id']}"],
                        ))
                continue

            if spec.device == TTSDevice.GPU_PREFERRED:
                if not _is_engine_installed(engine_id):
                    continue
                if has_gpu and _can_fit_on_gpu(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='gpu',
                        estimated_latency_ms=spec.latency_gpu_ms,
                        quality_score=spec.quality,
                    ))
                else:
                    # CPU fallback
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='cpu',
                        estimated_latency_ms=spec.latency_cpu_ms,
                        quality_score=spec.quality * 0.9,  # CPU quality slightly lower
                        warnings=['Running on CPU (slower, install GPU for better perf)'],
                    ))
                continue

            if spec.device == TTSDevice.CPU_ONLY:
                if _is_engine_installed(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='cpu',
                        estimated_latency_ms=spec.latency_cpu_ms,
                        quality_score=spec.quality,
                    ))
                continue

        # Step 5: Always ensure espeak as ultimate fallback
        if not any(c.engine.engine_id == 'espeak' for c in candidates):
            espeak_spec = ENGINE_REGISTRY['espeak']
            candidates.append(TTSCandidate(
                engine=espeak_spec,
                location=TTSLocation.LOCAL,
                device='cpu',
                estimated_latency_ms=10,
                quality_score=espeak_spec.quality,
                warnings=['Fallback: no neural TTS available for this language'],
            ))

        # Step 6: Sort by urgency-weighted score
        if urgency == 'instant':
            # Minimize latency — instant response
            candidates.sort(key=lambda c: (c.estimated_latency_ms, -c.quality_score))
        elif urgency == 'quality':
            # Maximize quality — don't care about latency
            candidates.sort(key=lambda c: (-c.quality_score, c.estimated_latency_ms))
        else:
            # Balance: quality * 0.6 + inverse_latency * 0.4
            max_latency = max(c.estimated_latency_ms for c in candidates) or 1
            candidates.sort(key=lambda c: -(
                c.quality_score * 0.6 +
                (1 - c.estimated_latency_ms / max_latency) * 0.4
            ))

        return candidates

    def synthesize(
        self,
        text: str,
        language: Optional[str] = None,
        voice: Optional[str] = None,
        output_path: Optional[str] = None,
        source: Optional[str] = None,
        urgency: str = 'normal',
        engine_override: Optional[str] = None,
    ) -> TTSResult:
        """Synthesize text using the best available TTS engine.

        Tries engines in ranked order until one succeeds.

        Args:
            text: Text to synthesize
            language: ISO 639-1 code (auto-detected if None)
            voice: Voice reference for cloning (path or saved name)
            output_path: Where to write WAV (auto-generated if None)
            source: Context hint (e.g. 'chat_response', 'greeting') —
                auto-maps to urgency via SOURCE_URGENCY
            urgency: 'instant' | 'normal' | 'quality' (used if source not set)
            engine_override: Force a specific engine (bypasses selection)

        Returns:
            TTSResult with synthesis details
        """
        # Auto-infer urgency from source hint
        if source:
            urgency = SOURCE_URGENCY.get(source, urgency)
        if not text or not text.strip():
            return TTSResult(
                path='', duration=0, engine_id='none', device='none',
                location='none', latency_ms=0, sample_rate=0, voice='',
                quality_score=0, error='Text is required',
            )

        lang = language or detect_language(text)

        # Normalize numbers, currency, URLs, units to spoken form BEFORE
        # engine selection — every TTS engine benefits (single converging
        # path). Latency-sensitive ('instant' urgency) skips the LLM
        # fallback but keeps the fast rule pass.
        try:
            from integrations.channels.media.tts_text_normalizer import (
                normalize_for_tts,
            )
            text = normalize_for_tts(
                text, lang, use_llm=(urgency != 'instant'),
            )
        except Exception as _e:  # never let normalization block synthesis
            logger.debug('tts normalization skipped: %s', _e)

        # Cloning is required only for a real voice reference (not the
        # default / empty sentinels).
        require_clone = voice is not None and voice not in ('default', '')

        # Engine override bypasses ranking entirely
        if engine_override and engine_override in ENGINE_REGISTRY:
            spec = ENGINE_REGISTRY[engine_override]
            candidates = [TTSCandidate(
                engine=spec,
                location=TTSLocation.LOCAL,
                device='gpu' if spec.device in (TTSDevice.GPU_ONLY, TTSDevice.GPU_PREFERRED) else 'cpu',
                estimated_latency_ms=spec.latency_gpu_ms or spec.latency_cpu_ms,
                quality_score=spec.quality,
            )]
        else:
            candidates = self.select_engines(
                text, lang, voice, urgency, require_clone,
            )

        # Try each candidate in order; collect failure reasons so the
        # final result (success or not) explains what was skipped.
        all_warnings: List[str] = []
        for candidate in candidates:
            t0 = time.time()
            try:
                result = self._execute(candidate, text, lang, voice, output_path)
                elapsed = (time.time() - t0) * 1000
                if result and not result.get('error'):
                    all_warnings.extend(candidate.warnings)
                    return TTSResult(
                        path=result.get('path', ''),
                        duration=result.get('duration', 0),
                        engine_id=candidate.engine.engine_id,
                        device=candidate.device,
                        location=candidate.location.value,
                        latency_ms=round(elapsed, 1),
                        sample_rate=result.get('sample_rate', candidate.engine.sample_rate),
                        voice=result.get('voice', voice or 'default'),
                        quality_score=candidate.quality_score,
                        warnings=all_warnings,
                    )
                err = result.get('error', 'unknown') if result else 'no result'
                all_warnings.append(
                    f"{candidate.engine.engine_id} failed: {err}"
                )
            except Exception as e:
                all_warnings.append(f"{candidate.engine.engine_id} error: {e}")
                logger.debug("TTS engine %s failed: %s", candidate.engine.engine_id, e)

        # All engines failed
        return TTSResult(
            path='', duration=0, engine_id='none', device='none',
            location='none', latency_ms=0, sample_rate=0, voice='',
            quality_score=0, warnings=all_warnings,
            error='All TTS engines failed',
        )

    def _execute(
        self, candidate: TTSCandidate, text: str,
        language: str, voice: Optional[str], output_path: Optional[str],
    ) -> Optional[Dict[str, Any]]:
        """Execute TTS on a specific candidate engine.

        Returns the engine's result dict (success or {'error': ...}),
        or None when a hive-peer offload produced nothing.
        """
        # Hive peer offload
        if candidate.location == TTSLocation.HIVE_PEER:
            peer_info = {
                'address': candidate.peer_address,
                'peer_id': 'hive',
                'latency_ms': candidate.estimated_latency_ms,
            }
            return _offload_tts_to_peer(peer_info, text, language, voice)

        # Cloud (MakeItTalk)
        if candidate.location == TTSLocation.CLOUD:
            return self._execute_makeittalk(text, voice)

        # Local engines with bespoke call shapes
        engine_id = candidate.engine.engine_id
        if engine_id == 'luxtts':
            return self._call_luxtts(text, voice, output_path, candidate.device)
        if engine_id == 'pocket_tts':
            return self._call_pocket_tts(text, voice, output_path)
        if engine_id == 'espeak':
            return self._call_espeak(text, language, output_path)

        # Everything else shares the generic service-tool call shape
        tool = self._GPU_ENGINE_TOOLS.get(engine_id)
        if tool:
            module_path, function_name = tool
            return self._call_gpu_engine(
                module_path, function_name, text, language, voice, output_path,
            )
        return {'error': f'Unknown engine: {engine_id}'}

    def _call_luxtts(self, text, voice, output_path, device):
        """LuxTTS — voice-clone engine with a bespoke (voice_audio, device) call shape."""
        from integrations.service_tools.luxtts_tool import luxtts_synthesize
        result_str = luxtts_synthesize(
            text, voice_audio=voice, output_path=output_path, device=device,
        )
        return json.loads(result_str)

    def _call_pocket_tts(self, text, voice, output_path):
        """Pocket TTS — CPU engine; 'alba' is used when no real voice is given."""
        from integrations.service_tools.pocket_tts_tool import pocket_tts_synthesize
        voice_name = voice if voice and voice != 'default' else 'alba'
        result_str = pocket_tts_synthesize(text, voice_name, output_path)
        return json.loads(result_str)

    def _call_espeak(self, text, language, output_path):
        """Call espeak-ng via pocket_tts_tool (DRY — reuses existing impl)."""
        import os

        if not output_path:
            out_dir = os.environ.get('TTS_TEMP_DIR', '/tmp/tts')
            os.makedirs(out_dir, exist_ok=True)
            output_path = os.path.join(out_dir, f'espeak_{int(time.time()*1000)}.wav')

        try:
            from integrations.service_tools.pocket_tts_tool import _espeak_synthesize
            espeak_lang = language if language else 'en'
            if _espeak_synthesize(text[:5000], output_path, voice=espeak_lang):
                return {
                    'path': output_path,
                    'duration': len(text.split()) / 150 * 60,  # ~150 wpm estimate
                    'sample_rate': 22050,
                    'voice': espeak_lang,
                    'engine': 'espeak-ng',
                }
            return {'error': 'espeak-ng not installed'}
        except Exception:
            return {'error': 'espeak-ng not available'}

    def _call_gpu_engine(self, module_path, function_name, text, language,
                         voice, output_path):
        """Generic caller for GPU TTS service tools.

        Imports `module_path`, calls `function_name(text, language=,
        voice=, output_path=)` and decodes its JSON string result.
        """
        import importlib
        try:
            mod = importlib.import_module(module_path)
            fn = getattr(mod, function_name)
            result_str = fn(text, language=language, voice=voice,
                            output_path=output_path)
            return json.loads(result_str)
        except ImportError as e:
            return {'error': f'{module_path} not installed: {e}'}
        except Exception as e:
            return {'error': str(e)}

    def _execute_makeittalk(self, text, voice):
        """Cloud TTS via MakeItTalk API (requires MAKEITTALK_API_URL)."""
        import os
        base_url = os.environ.get('MAKEITTALK_API_URL')
        if not base_url:
            return {'error': 'MAKEITTALK_API_URL not set'}
        try:
            import requests
            resp = requests.post(
                f"{base_url}/video-gen/",
                json={
                    'text': text,
                    'voiceName': voice or 'af_bella',
                    'audio_only': True,
                },
                timeout=30,
            )
            if resp.status_code == 200:
                data = resp.json()
                audio_url = data.get('audio_url') or data.get('url', '')
                return {
                    'path': audio_url,
                    'duration': data.get('duration', 0),
                    'voice': voice or 'af_bella',
                    'engine': 'makeittalk',
                    'sample_rate': 24000,
                }
            return {'error': f'MakeItTalk HTTP {resp.status_code}'}
        except Exception as e:
            return {'error': f'MakeItTalk: {e}'}

    def get_engine_status(self) -> List[Dict[str, Any]]:
        """Report status of all TTS engines for diagnostics."""
        gpu_info = _get_gpu_info()
        has_gpu = gpu_info.get('cuda_available', False)
        statuses = []

        for eid, spec in ENGINE_REGISTRY.items():
            installed = _is_engine_installed(eid)
            can_run = False
            device = 'n/a'

            if spec.device == TTSDevice.CPU_ONLY:
                can_run = installed
                device = 'cpu'
            elif spec.device == TTSDevice.GPU_ONLY:
                can_run = installed and has_gpu and _can_fit_on_gpu(eid)
                device = 'gpu' if can_run else 'n/a'
            elif spec.device == TTSDevice.GPU_PREFERRED:
                can_run = installed
                device = 'gpu' if (has_gpu and _can_fit_on_gpu(eid)) else 'cpu'
            elif spec.device == TTSDevice.CLOUD:
                can_run = installed
                device = 'cloud'

            statuses.append({
                'engine': eid,
                'installed': installed,
                'can_run': can_run,
                'device': device,
                'languages': list(spec.languages),
                'quality': spec.quality,
                'voice_clone': spec.voice_clone,
                'vram_gb': spec.vram_key,
            })

        return statuses

    def get_all_voices(self) -> List[Dict[str, Any]]:
        """Aggregate available voices from all installed TTS engines."""
        voices: List[Dict[str, Any]] = []
        try:
            from integrations.service_tools.pocket_tts_tool import (
                _BUILTIN_VOICES,
            )
            for v in _BUILTIN_VOICES:
                voices.append({'id': v, 'engine': 'pocket_tts', 'type': 'builtin'})
        except Exception:
            pass
        try:
            from integrations.service_tools.luxtts_tool import luxtts_list_voices
            result = json.loads(luxtts_list_voices())
            for v in result.get('voices', []):
                voices.append({'id': v.get('id', ''), 'engine': 'luxtts', 'type': 'cloned'})
        except Exception:
            pass
        return voices
1515# ═══════════════════════════════════════════════════════════════
1516# Singleton
1517# ═══════════════════════════════════════════════════════════════
_router_instance: Optional[TTSRouter] = None


def get_tts_router() -> TTSRouter:
    """Return the process-wide TTSRouter singleton, creating it lazily."""
    global _router_instance
    if _router_instance is not None:
        return _router_instance
    _router_instance = TTSRouter()
    return _router_instance
1530# ═══════════════════════════════════════════════════════════════
1531# ModelCatalog integration — populate_tts_catalog()
1532# ═══════════════════════════════════════════════════════════════
# Reflection-dispatch contract for catalog entries that have NO
# `tool_module` (pure-JSON model registration via admin UI / hive
# federation / model_catalog.json edit). An entry without `tool_module`
# MUST declare every field below in its `capabilities` dict — otherwise
# the dispatcher has no way to know how to instantiate the class, marshal
# the request, or normalize the return. See task #58 for the full
# rationale; the schema is finalized at 5 fields, no more.
# Enforced by _validate_engine_caps() at catalog-ingest time.
_REFLECTION_FIELDS: Tuple[str, ...] = (
    'import_path',    # 'pkg.module:ClassName'
    'init_args',      # dict — kwargs for ClassName(**init_args); {} OK
    'synth_method',   # str — instance method name
    'params_map',     # dict — {payload_key → method_kwarg}
    'output_format',  # canonical id (see _OUTPUT_FORMATS below)
)

# Canonical return-shape identifiers the reflection dispatcher knows
# how to normalize into a wire-format wav (or path). Engines that
# return shapes outside this set MUST use the `tool_module` escape
# hatch instead — the dispatcher won't guess.
_OUTPUT_FORMATS: Tuple[str, ...] = (
    'wav_bytes',   # bytes object holding a WAV-formatted byte stream
    'numpy_24k',   # 1-D float32 numpy array @ 24 kHz mono
    'file_path',   # str path to a wav file the engine wrote
    'bytesio',     # io.BytesIO containing wav bytes
)
1561def _validate_engine_caps(caps: Dict[str, Any]) -> Optional[str]:
1562 """Validate a TTS catalog entry's capabilities dict.
1564 Returns None when the entry is dispatchable, OR a human-readable
1565 error string when it is not. Two valid shapes:
1567 1. Python-tool path (escape hatch):
1568 caps['tool_module'] = 'pkg.module' # required
1569 The entry will be dispatched via the existing
1570 `gpu_worker._dispatch_and_run` path: import the module, pick
1571 up `_load[_<variant>]` / `_synthesize[_<variant>]` callbacks
1572 by convention. This is what every code-shipped engine in
1573 ENGINE_REGISTRY uses today.
1575 2. Pure-config / reflection path:
1576 caps lacks tool_module BUT declares ALL of _REFLECTION_FIELDS.
1577 The dispatcher will use reflection to instantiate the class
1578 and call the synth method — no .py file needed for adding
1579 new models that fit a homogeneous load+method API (Kokoro,
1580 Pocket-TTS, etc., evaluated empirically per engine).
1582 Validation fires at INGEST time (populate_tts_catalog upsert path
1583 AND _catalog_entry_to_spec read path) so a malformed entry cannot
1584 reach the dispatcher. This guards against the "user discovers the
1585 error only when they request the voice" failure mode.
1586 """
1587 if not isinstance(caps, dict):
1588 return f'capabilities must be a dict, got {type(caps).__name__}'
1590 if caps.get('tool_module'):
1591 # Python-tool entry — tool_module on its own is sufficient. The
1592 # dispatcher will pick up _load / _synthesize via convention.
1593 return None
1595 # Reflection entry — every field is required. No partial schemas.
1596 missing = [f for f in _REFLECTION_FIELDS if f not in caps]
1597 if missing:
1598 return (
1599 f'entry has no tool_module and is missing reflection fields '
1600 f'{missing}; reflection dispatch needs the full 5-field '
1601 f'contract: {list(_REFLECTION_FIELDS)}'
1602 )
1604 # Cheap shape sanity — early-fail with a precise message rather than
1605 # let the dispatcher trip on a bad type at synth time.
1606 if not isinstance(caps.get('init_args'), dict):
1607 return f'init_args must be a dict, got {type(caps.get("init_args")).__name__}'
1608 if not isinstance(caps.get('params_map'), dict):
1609 return f'params_map must be a dict, got {type(caps.get("params_map")).__name__}'
1610 if not isinstance(caps.get('synth_method'), str) or not caps['synth_method']:
1611 return 'synth_method must be a non-empty str'
1612 if not isinstance(caps.get('import_path'), str) or ':' not in caps['import_path']:
1613 return (
1614 f'import_path must be "pkg.module:ClassName", got '
1615 f'{caps.get("import_path")!r}'
1616 )
1617 if caps.get('output_format') not in _OUTPUT_FORMATS:
1618 return (
1619 f'output_format must be one of {list(_OUTPUT_FORMATS)}, got '
1620 f'{caps.get("output_format")!r}'
1621 )
1622 return None
# Human-readable display names for each engine (used in admin UI).
# Keys are the same engine ids used as ENGINE_REGISTRY lookups throughout
# this module.
_ENGINE_DISPLAY_NAMES: Dict[str, str] = {
    'chatterbox_turbo': 'Chatterbox Turbo (GPU, English, voice-clone)',
    'luxtts': 'LuxTTS (CPU, English, voice-clone)',
    'cosyvoice3': 'CosyVoice 3 (GPU, multilingual, voice-clone)',
    'f5_tts': 'F5-TTS (GPU, EN/ZH, voice-clone)',
    'indic_parler': 'Indic Parler-TTS (GPU, 22 Indic languages)',
    'chatterbox_ml': 'Chatterbox Multilingual (GPU, 23 languages, voice-clone)',
    'pocket_tts': 'Pocket TTS (CPU, English, voice-clone)',
    'kokoro': 'Kokoro 82M (CPU/GPU, English, neural)',
    'espeak': 'eSpeak-NG (CPU, 100+ languages, instant fallback)',
    'makeittalk': 'MakeItTalk (Cloud, English)',
    'melotts': 'MeloTTS (CPU/GPU, 6 langs, neural)',
    'xtts_v2': 'XTTS-v2 (GPU, 17 langs, voice-clone)',
    'mms_tts': 'MMS-TTS (CPU/GPU, 50+ langs via VITS)',
}
# Extra capabilities per engine that don't map 1-to-1 onto TTSEngineSpec fields
def _extra_caps(streaming: bool = False,
                paralinguistic: Optional[List[str]] = None,
                emotion_tags: bool = False) -> Dict[str, Any]:
    """Build one extra-capabilities record; the defaults describe a plain
    engine (no streaming, no paralinguistic markers, no emotion tags).

    A fresh list is returned for every call so records never alias each
    other's 'paralinguistic' value.
    """
    return {
        'streaming': streaming,
        'paralinguistic': list(paralinguistic) if paralinguistic else [],
        'emotion_tags': emotion_tags,
    }


_ENGINE_EXTRA_CAPS: Dict[str, Dict[str, Any]] = {
    'chatterbox_turbo': _extra_caps(
        paralinguistic=['emotion_happy', 'emotion_sad', 'emotion_angry',
                        'emotion_surprised', 'laughing', 'whispering'],
        emotion_tags=True,
    ),
    'luxtts': _extra_caps(),
    'cosyvoice3': _extra_caps(
        streaming=True,
        paralinguistic=['emotion_happy', 'emotion_sad', 'whispering'],
        emotion_tags=True,
    ),
    'f5_tts': _extra_caps(),
    'indic_parler': _extra_caps(),
    'chatterbox_ml': _extra_caps(
        paralinguistic=['emotion_happy', 'emotion_sad', 'whispering'],
        emotion_tags=True,
    ),
    'pocket_tts': _extra_caps(),
    'kokoro': _extra_caps(),
    'espeak': _extra_caps(),
    'makeittalk': _extra_caps(),
    'melotts': _extra_caps(),
    'xtts_v2': _extra_caps(),
    'mms_tts': _extra_caps(),
}
# Device → backend string mapping for ModelEntry.backend field.
# Built from (device, backend) pairs keyed on the enum's .value so the
# table reads as a single aligned list of associations.
_DEVICE_TO_BACKEND: Dict[str, str] = {
    device.value: backend
    for device, backend in (
        (TTSDevice.GPU_ONLY, 'torch'),
        (TTSDevice.GPU_PREFERRED, 'torch'),
        (TTSDevice.CPU_ONLY, 'in_process'),
        (TTSDevice.CLOUD, 'api'),
    )
}
# Device → (supports_gpu, supports_cpu) flags, keyed on the enum's .value.
_DEVICE_TO_COMPUTE: Dict[str, Tuple[bool, bool]] = {
    device.value: (gpu_ok, cpu_ok)
    for device, gpu_ok, cpu_ok in (
        (TTSDevice.GPU_ONLY, True, False),
        (TTSDevice.GPU_PREFERRED, True, True),
        (TTSDevice.CPU_ONLY, False, True),
        (TTSDevice.CLOUD, False, False),
    )
}
# DEPRECATED: VRAM specs now live in vram_manager.VRAM_BUDGETS (single
# source of truth). Use _engine_vram_gb(engine_id) helper below.
# This dict is kept for backward compatibility but should NOT be edited.
_ENGINE_VRAM_GB: Dict[str, float] = {}  # populated lazily by _engine_vram_gb


def _engine_vram_gb(engine_id: str) -> float:
    """Single source of truth for engine VRAM requirement.

    Reads from vram_manager.VRAM_BUDGETS — the canonical specs.
    The vram_manager key convention is 'tts_<engine_id>' (e.g. 'tts_indic_parler').
    Returns 0.0 only if engine has no GPU requirement (CPU-only engine).
    Logs a warning if engine is GPU-capable but missing from VRAM_BUDGETS
    (catches drift between the two registries).

    Args:
        engine_id: registry key, e.g. 'kokoro' (without the 'tts_' prefix).

    Returns:
        The engine's VRAM requirement in GB as a float; 0.0 when the
        engine has no VRAM_BUDGETS entry or vram_manager is unavailable.
    """
    # Fast path: answer already cached. Fallback 0.0 is cached too, so the
    # drift warning below fires at most once per engine per process.
    if engine_id in _ENGINE_VRAM_GB:
        return _ENGINE_VRAM_GB[engine_id]
    try:
        from integrations.service_tools.vram_manager import VRAM_BUDGETS
        key = f'tts_{engine_id}'
        if key in VRAM_BUDGETS:
            # float() guarantees the annotated return type even if the
            # budget table stores an int.
            vram = float(VRAM_BUDGETS[key][0])  # (gpu_gb, cpu_gb)
            _ENGINE_VRAM_GB[engine_id] = vram
            return vram
        # Engine not registered in vram_manager — WARNING (not debug) per
        # the docstring contract: this is registry drift worth surfacing.
        logger.warning(
            "TTS engine %r has no VRAM_BUDGETS entry (key=%r) — "
            "assuming CPU-only. Add to vram_manager.VRAM_BUDGETS if GPU-capable.",
            engine_id, key,
        )
    except ImportError:
        # Expected in stripped-down deployments; stay quiet at debug level.
        logger.debug("vram_manager unavailable, assuming CPU-only for %r", engine_id)
    _ENGINE_VRAM_GB[engine_id] = 0.0
    return 0.0
# Approximate disk footprint per engine (GB)
# NOTE(review): 'kokoro' has no entry here — populate_tts_catalog's
# .get(engine_id, 0.0) fallback reports 0.0 GB disk for it; confirm
# whether that omission is intentional.
_ENGINE_DISK_GB: Dict[str, float] = dict(
    chatterbox_turbo=2.0,
    luxtts=0.5,
    cosyvoice3=3.5,
    f5_tts=2.5,
    indic_parler=4.0,
    chatterbox_ml=3.0,
    pocket_tts=0.1,
    espeak=0.05,
    makeittalk=0.0,
    melotts=1.5,   # 6 per-lang checkpoints, ~250 MB each
    xtts_v2=2.0,   # weights + speakers + config
    mms_tts=0.2,   # ~150 MB per lang lazy-downloaded
)
# Approximate RAM needed for CPU-capable engines (GB)
# NOTE(review): 'kokoro' is missing here as well — populate_tts_catalog's
# .get(engine_id, 0.5) fallback applies; verify that default is adequate.
_ENGINE_RAM_GB: Dict[str, float] = dict(
    chatterbox_turbo=2.0,
    luxtts=2.0,
    cosyvoice3=4.0,
    f5_tts=2.0,
    indic_parler=4.0,
    chatterbox_ml=4.0,
    pocket_tts=0.5,
    espeak=0.1,
    makeittalk=0.1,
    melotts=2.0,
    xtts_v2=3.0,
    mms_tts=1.5,
)
def populate_tts_catalog(catalog) -> int:
    """Convert ENGINE_REGISTRY into ModelEntry objects and register them.

    Called by ModelCatalog.populate_from_subsystems() via the populator
    plugin mechanism — keeps tts_router as the single source of truth for
    TTS engine capabilities.

    Validation contract (#58): admin- or hive-supplied catalog entries
    that exist BEFORE this populator runs are validated against
    `_validate_engine_caps`. Invalid entries are removed from the
    catalog with a logged WARNING — they cannot reach the dispatcher.
    This is the "fail-fast at catalog ingest, not synth time" half of
    the contract; the other half (validation on every read) lives in
    `_catalog_entry_to_spec`.

    Args:
        catalog: ModelCatalog instance (accepts Any to avoid a hard import
            at module level — the catalog is passed in by the caller).

    Returns:
        Number of new entries added (skips already-registered IDs).
    """
    # Lazy import inside function body — avoids circular import at module load
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Pre-pass: validate any existing TTS entries (admin/hive seeded the
    # catalog before us). Invalid entries are removed + logged so they
    # don't poison `_refresh_engine_registry_from_catalog` below. Code-
    # shipped engines (ENGINE_REGISTRY) ALWAYS have tool_module so they
    # never trip this; the gate exists for foreign manifests.
    _drop_ids: List[str] = []
    for entry in list(catalog.list_by_type('tts')):
        err = _validate_engine_caps(entry.capabilities or {})
        if err:
            logger.warning(
                'TTS catalog entry %r rejected at ingest: %s', entry.id, err,
            )
            _drop_ids.append(entry.id)
    # #58 Scope-2 (2026-05-07): reflection-only entries (caps lack
    # tool_module but declare the full 5-field contract) are now
    # dispatchable via `gpu_worker._dispatch_catalog_id` (`python -m
    # gpu_worker --catalog-id <id>`). They survive ingest as long as
    # `_validate_engine_caps` passes; they are EXCLUDED from the
    # ENGINE_REGISTRY snapshot by `_refresh_engine_registry_from_catalog`
    # because TTSEngineSpec carries `tool_module` as a non-optional
    # dispatch handle for the existing call sites. The catalog reads
    # them via the --catalog-id path instead.
    for _eid in _drop_ids:
        try:
            catalog.unregister(_eid, persist=False)
        except Exception as _re:
            # Best-effort cleanup — a failed unregister must not abort
            # the whole populate pass.
            logger.debug('failed to unregister invalid TTS entry %r: %s',
                         _eid, _re)

    added = 0
    for engine_id, spec in ENGINE_REGISTRY.items():
        # Catalog ID convention: 'tts-' prefix, underscores become dashes.
        # Built once here — used for both the skip check and the entry.
        entry_id = f'tts-{engine_id.replace("_", "-")}'
        # Skip if already registered (preserves user edits from admin UI)
        if catalog.get(entry_id) is not None:
            continue

        device_value = spec.device.value
        supports_gpu, supports_cpu = _DEVICE_TO_COMPUTE.get(
            device_value, (False, True)
        )
        backend = _DEVICE_TO_BACKEND.get(device_value, 'in_process')

        # Build language_priority from LANG_ENGINE_PREFERENCE:
        # lower rank in the preference list → lower priority number → preferred
        lang_priority: Dict[str, int] = {}
        for lang, engine_list in LANG_ENGINE_PREFERENCE.items():
            if engine_id in engine_list:
                rank = engine_list.index(engine_id)  # 0 = most preferred
                lang_priority[lang] = rank * 10  # 0, 10, 20, ...

        # Pick the best latency figure for quality/speed scores
        best_latency_ms = min(
            (v for v in (spec.latency_gpu_ms, spec.latency_cpu_ms,
                         spec.latency_cloud_ms) if v > 0),
            default=5000,
        )
        # speed_score: 1.0 = instant (≤10 ms), 0.0 = very slow (≥5000 ms).
        # Clamped on BOTH ends — without the min(), an engine faster than
        # 10 ms would score above 1.0 and skew catalog ranking.
        speed_score = min(1.0, max(0.0, 1.0 - (best_latency_ms - 10) / 4990))

        # Build capabilities dict — TTS-specific fields + extras
        extra = _ENGINE_EXTRA_CAPS.get(engine_id, {})
        capabilities: Dict[str, Any] = {
            'voice_clone': spec.voice_clone,
            'sample_rate': spec.sample_rate,
            'latency_gpu_ms': spec.latency_gpu_ms,
            'latency_cpu_ms': spec.latency_cpu_ms,
            'latency_cloud_ms': spec.latency_cloud_ms,
            'tool_module': spec.tool_module,
            'tool_function': spec.tool_function,
            'vram_key': spec.vram_key,
            'streaming': extra.get('streaming', False),
            'paralinguistic': extra.get('paralinguistic', []),
            'emotion_tags': extra.get('emotion_tags', False),
        }

        # languages list — ('*',) means "all"; store as-is so select_best
        # language matching still works (catalog treats '*' as wildcard)
        languages = list(spec.languages)

        entry = ModelEntry(
            id=entry_id,
            name=_ENGINE_DISPLAY_NAMES.get(engine_id, engine_id),
            model_type=ModelType.TTS,
            version='1.0',
            source='cloud' if spec.device == TTSDevice.CLOUD else 'local',
            vram_gb=_engine_vram_gb(engine_id),
            ram_gb=_ENGINE_RAM_GB.get(engine_id, 0.5),
            disk_gb=_ENGINE_DISK_GB.get(engine_id, 0.0),
            min_capability_tier='lite' if supports_cpu else 'standard',
            backend=backend,
            supports_gpu=supports_gpu,
            supports_cpu=supports_cpu,
            supports_cpu_offload=False,
            idle_timeout_s=300.0,
            capabilities=capabilities,
            quality_score=spec.quality,
            speed_score=round(speed_score, 3),
            priority=50,
            languages=languages,
            language_priority=lang_priority,
            tags=['tts', 'local' if spec.device != TTSDevice.CLOUD else 'cloud'],
            enabled=True,
            auto_load=False,
        )
        catalog.register(entry, persist=False)
        added += 1

    # Post-upsert: rebuild ENGINE_REGISTRY in place so it reflects the
    # current catalog state (admin/hive-edited entries become visible
    # to existing call sites). Snapshot semantics — runtime catalog
    # mutations after this point do NOT auto-propagate; a re-bootstrap
    # is required. Matches the dict-iter assumption every existing
    # ENGINE_REGISTRY caller relies on. See task #58 acceptance #5.
    _refresh_engine_registry_from_catalog(catalog)

    return added
def _refresh_engine_registry_from_catalog(catalog) -> int:
    """Rebuild ENGINE_REGISTRY in place from the post-upsert catalog.

    Reflection-only entries (no tool_module) are excluded — they live
    only in the catalog and are dispatched via the `--catalog-id`
    path. TTSEngineSpec callers continue to see only spec-shaped
    entries, exactly as before this refactor.

    Returns the number of entries in the rebuilt registry.

    Idempotent: calling twice with the same catalog state produces the
    same registry contents.
    """
    rebuilt: Dict[str, TTSEngineSpec] = {}
    for tts_entry in catalog.list_by_type('tts'):
        converted = _catalog_entry_to_spec(tts_entry)
        if converted is None:
            # None ⇒ caps failed validation, or reflection-only entry.
            continue
        rebuilt[converted.engine_id] = converted
    # clear() + update() mutates the existing dict object, so modules
    # that imported ENGINE_REGISTRY keep seeing the refreshed contents.
    ENGINE_REGISTRY.clear()
    ENGINE_REGISTRY.update(rebuilt)
    return len(rebuilt)
def _catalog_entry_to_spec(entry) -> Optional[TTSEngineSpec]:
    """Convert a ModelCatalog ModelEntry back to a TTSEngineSpec.

    Used by code that needs a TTSEngineSpec but only has a catalog entry
    (e.g. when the router consults the catalog for dynamically registered
    engines that were not present in ENGINE_REGISTRY at startup).

    Returns None if:
      * the entry's capabilities fail validation (#58 contract — see
        `_validate_engine_caps`); the caller should NOT see that entry
        because the dispatcher cannot route to it.
      * the entry uses the reflection-only dispatch path (no tool_module).
        TTSEngineSpec carries `tool_module` as a non-optional dispatch
        handle for the existing call sites; reflection-only entries are
        dispatched directly from the catalog and are intentionally
        excluded from the ENGINE_REGISTRY snapshot.
    """
    caps = entry.capabilities or {}
    if _validate_engine_caps(caps):
        # Invalid caps — the populator/loader already logged this loudly
        # at ingest; stay silent on re-reads to avoid log spam.
        return None

    module = caps.get('tool_module')
    if not module:
        # Valid reflection-only entry: TTSEngineSpec requires tool_module,
        # so `_refresh_engine_registry_from_catalog` skips it and the
        # catalog `--catalog-id` dispatch path serves it instead.
        return None

    # Reconstruct the TTSDevice from the supports_* flags, falling back
    # to the cloud-latency hint when the entry claims neither GPU nor CPU.
    has_gpu, has_cpu = entry.supports_gpu, entry.supports_cpu
    if has_gpu:
        device = TTSDevice.GPU_PREFERRED if has_cpu else TTSDevice.GPU_ONLY
    elif has_cpu:
        device = TTSDevice.CPU_ONLY
    elif caps.get('latency_cloud_ms', 0) > 0:
        device = TTSDevice.CLOUD
    else:
        device = TTSDevice.CPU_ONLY

    # populate_tts_catalog prefixes catalog IDs with 'tts-'; undo that.
    engine_id = entry.id[len('tts-'):] if entry.id.startswith('tts-') else entry.id

    return TTSEngineSpec(
        engine_id=engine_id,
        device=device,
        vram_key=caps.get('vram_key', ''),
        languages=tuple(entry.languages) if entry.languages else ('en',),
        quality=entry.quality_score,
        voice_clone=caps.get('voice_clone', False),
        latency_gpu_ms=caps.get('latency_gpu_ms', 0),
        latency_cpu_ms=caps.get('latency_cpu_ms', 0),
        latency_cloud_ms=caps.get('latency_cloud_ms', 0),
        tool_module=module,
        tool_function=caps.get('tool_function'),
        sample_rate=caps.get('sample_rate', 24000),
    )