Coverage for integrations / channels / media / tts_router.py: 79.1%
521 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""Smart TTS Router — selects the best TTS engine based on constraints.
3Decision factors (in priority order):
41. Language — which engines support the target language?
52. Availability — is the engine installed locally?
63. Hardware — GPU present? Enough VRAM? CPU-only fallback?
74. Compute policy — local_only | local_preferred | any (hive offload)
85. Latency — instant (espeak/browser) vs quality (neural)
96. Voice cloning — only clone-capable engines if voice requested
107. Hive peers — offload to GPU peer when local can't serve
11"""
13import json
14import logging
15import time
16from dataclasses import dataclass, field
17from enum import Enum
18from typing import Any, Dict, List, Optional, Tuple
20logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════
# Source → Urgency mapping (backend auto-infers, frontends send source)
# ═══════════════════════════════════════════════════════════════

# Maps the *origin* of a speak request to an urgency class:
#   'instant' — latency first (use the fastest engine available)
#   'normal'  — balanced latency vs. quality
#   'quality' — quality first (user explicitly asked to hear this)
SOURCE_URGENCY: Dict[str, str] = {
    'chat_response': 'normal',    # agent reply in chat
    'notification': 'instant',    # system notification
    'greeting': 'instant',        # boot/login greeting
    'read_aloud': 'quality',      # user clicked "speak this"
    'channel': 'normal',          # Discord/Telegram response
    'cli': 'quality',             # hart voice "text"
    'agent_tool': 'normal',       # agent using TTS tool
}
36# ═══════════════════════════════════════════════════════════════
37# Engine Registry — static capabilities of every TTS engine
38# ═══════════════════════════════════════════════════════════════
class TTSDevice(Enum):
    """Hardware class a TTS engine is able to run on."""
    GPU_ONLY = "gpu_only"            # requires a GPU; no CPU synthesis path
    GPU_PREFERRED = "gpu_preferred"  # works on CPU too, GPU better
    CPU_ONLY = "cpu_only"            # never uses the GPU
    CLOUD = "cloud"                  # remote service, no local compute
@dataclass(frozen=True)
class TTSEngineSpec:
    """Static specification of a TTS engine's capabilities.

    Attributes:
        engine_id: Registry key for this engine.
        device: Hardware class the engine runs on (``TTSDevice``).
        vram_key: Key in ``VRAM_BUDGETS`` (vram_manager.py); '' when the
            engine never touches the GPU.
        languages: ISO 639-1 codes, or ``('*',)`` wildcard for "all".
        quality: 0.0-1.0 subjective quality score.
        voice_clone: True when the engine can clone a reference voice.
        latency_gpu_ms: Estimated latency on GPU (0 if N/A).
        latency_cpu_ms: Estimated latency on CPU (0 if N/A).
        latency_cloud_ms: Estimated latency on cloud (0 if N/A).
        tool_module: Python module path for the tool (None for engines
            driven through a special path, e.g. cloud or bundled piper).
        tool_function: Parent-side synthesize function name.
        tool_worker_attr: ToolWorker attribute name on the tool module;
            None for CPU-only engines that have no subprocess worker.
        required_package: pip package name that must be importable at
            runtime; None for engines whose deps are bundled (e.g. piper)
            or CPU-only with no extra deps.
        pip_install_plan: Canonical pip-spec list to make
            ``required_package`` importable AND synth-functional —
            includes transitive deps the upstream package may forget to
            declare in its install_requires (e.g. chatterbox-tts ships
            ``import librosa`` in tts.py but doesn't list librosa as a
            hard dep, so a no-deps pip install leaves a broken package
            that imports far enough for find_spec() but blows up on
            actual synthesize calls — see
            ~/Documents/Nunba/logs/probe_chatterbox_turbo.err). This is
            the single source of truth for the desktop installer (Nunba)
            so it doesn't carry a parallel dict that drifts. Empty tuple
            means nothing to install (bundled / CPU stub).
        install_target: WHERE the desktop installer should land
            ``pip_install_plan``:

            * ``'main'`` — into the main python-embed site-packages
              (legacy default; risky — dep conflicts mask silent
              failures).
            * ``'venv'`` — into a private venv at
              ``~/Documents/Nunba/data/venvs/<engine>/``. Requires a
              per-engine worker file (``tts/<engine>_worker.py``) that
              the parent dispatches into via
              ``backend_venv.invoke_in_venv()``.
            * ``'bundled'`` — already on disk via the frozen build
              (piper voices, luxtts, espeak).
            * ``'cloud'`` — HTTP-only, nothing to install (makeittalk).
            * ``'git_clone'`` — needs a git clone of an upstream repo
              plus ``pip install -e`` (cosyvoice3 →
              FunAudioLLM/CosyVoice).

            The default ``'main'`` preserves current behavior; flipping
            a GPU engine to ``'venv'`` requires the matching worker file
            in Nunba (or the dispatch falls back to an in-process import,
            which only works if the engine is also installed in main).
        sample_rate: Output sample rate in Hz.
    """

    engine_id: str
    device: TTSDevice
    vram_key: str
    languages: Tuple[str, ...]
    quality: float
    voice_clone: bool
    latency_gpu_ms: int
    latency_cpu_ms: int
    latency_cloud_ms: int
    tool_module: Optional[str]
    tool_function: Optional[str]
    tool_worker_attr: Optional[str] = None
    required_package: Optional[str] = None
    pip_install_plan: Tuple[str, ...] = ()
    install_target: str = 'main'
    sample_rate: int = 24000
143# Shared pip-spec constants — keep here so the install plans below stay
144# readable and so a single edit updates every engine that pins them.
145#
146# huggingface_hub 0.29+ removes is_offline_mode that transformers <5.x
147# still imports, so we cap below 0.29 for the chatterbox / kokoro chain.
148_HF_HUB_PIN = 'huggingface_hub>=0.27.0,<0.29.0'
150# Chatterbox plan — `chatterbox-tts` on PyPI omits MULTIPLE runtime
151# imports from its install_requires. Each one only surfaces when the
152# install proceeds far enough for the next one to be reached:
153#
154# chatterbox/__init__.py:9 → from .tts import ChatterboxTTS
155# chatterbox/tts.py:4 → import librosa (missing #1)
156# chatterbox/tts.py:6 → import perth (missing #2)
157#
158# Each was discovered from a real failed install at
159# ~/Documents/Nunba/logs/probe_chatterbox_turbo.err on the user's
160# desktop — first librosa, then once that was added, perth. Listing
161# them all here means a fresh chatterbox install completes in one
162# pip pass instead of needing 2-3 self-heal iterations (each of
163# which downloads ~10 MB of pip metadata). The Nunba self-heal
164# loop catches future un-declared transitives on the install screen
165# without surfacing a synth failure to the user.
166_CHATTERBOX_PIP_PLAN: Tuple[str, ...] = (
167 _HF_HUB_PIN,
168 'torchaudio',
169 'chatterbox-tts',
170 'librosa', # missing transitive #1 — chatterbox/tts.py:4
171 'soundfile', # librosa needs it on Windows for non-WAV outputs
172 'resemble-perth', # missing transitive #2 — chatterbox/tts.py:6
173 # `import perth`; PyPI pkg name = resemble-perth
174 # (the watermark library Resemble AI uses to
175 # tag synthesized audio).
176 # NOTE on the rest of chatterbox-tts==0.1.7's requires_dist
177 # (omegaconf, conformer, pyloudnorm, pykakasi, spacy-pkuseg,
178 # diffusers, einops, s3tokenizer, etc.):
179 # We deliberately do NOT pre-install them in one pip pass.
180 # When pip is asked to install many at once with
181 # `--no-build-isolation` (frozen build constraint, see
182 # package_installer._run_pip), and one of the transitives needs
183 # a source build (omegaconf → antlr4-python3-runtime==4.9.* is
184 # sdist-only on PyPI), pip's parallel-builds path races against
185 # the bundle's setuptools and surfaces as
186 # BackendUnavailable: Cannot import 'setuptools.build_meta'
187 # (observed 2026-04-28 on the user's bundle f2d4567 — full pip
188 # invocation aborts rc=2, no transitive gets installed).
189 # _self_heal_missing_transitives in package_installer.py handles
190 # them one-at-a-time AFTER the chatterbox-tts top-level install
191 # — single-package mode never triggers the parallel-build race.
192 # Combined with the PYTHONNOUSERSITE=1 fix in tts/_torch_probe.py
193 # (probe no longer leaks system Python's site-packages), each
194 # heal cycle finds a REAL missing transitive, not a phantom one.
195 # The original 5-cycle trail (librosa → perth → einops →
196 # s3tokenizer → omegaconf) is fine because each cycle resolves
197 # in ~10-30s of single-package pip work, not 5 minutes of
198 # parallel-build resolver thrash.
199)
# All known TTS engines, keyed by engine_id.
ENGINE_REGISTRY: Dict[str, TTSEngineSpec] = {
    'chatterbox_turbo': TTSEngineSpec(
        engine_id='chatterbox_turbo',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_chatterbox_turbo',
        languages=('en',),
        quality=0.95,
        voice_clone=True,
        latency_gpu_ms=150,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.chatterbox_tool',
        tool_function='chatterbox_synthesize',
        tool_worker_attr='_turbo',
        required_package='chatterbox',
        pip_install_plan=_CHATTERBOX_PIP_PLAN,
        # chatterbox-tts 0.1.7 hard-pins torch==2.6.0, transformers==5.2.0,
        # numpy<2.0.0, diffusers==0.29.0, safetensors==0.5.3 — all in
        # direct conflict with HARTOS's main interpreter (torch 2.11,
        # transformers 5.1, numpy 2.4, diffusers 0.37, safetensors 0.7).
        # Auto-heal can never satisfy these because main-interpreter
        # downgrades would break llama-server, indic_parler, faster-whisper,
        # and every other ML stack. Quarantine into its own venv —
        # same pattern indic_parler uses (parler-tts pinned
        # transformers<4.47). Nunba's tts/package_installer.py routes
        # the install into ~/.nunba/venvs/chatterbox_turbo/, and the
        # HARTOS ToolWorker's python_exe is set to the venv's python
        # at runtime via desktop/_wire_venv_engines.py at boot, so the
        # synth subprocess sees the pinned chatterbox-compatible deps
        # instead of the main interpreter's incompatible newer ones.
        install_target='venv',
    ),
    # luxtts REMOVED — poor audio quality, not suitable for any use case.
    'cosyvoice3': TTSEngineSpec(
        engine_id='cosyvoice3',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_cosyvoice3',
        languages=('zh', 'ja', 'ko', 'de', 'es', 'fr', 'it', 'ru', 'en'),
        quality=0.92,
        voice_clone=True,
        latency_gpu_ms=200,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.cosyvoice_tool',
        tool_function='cosyvoice_synthesize',
        tool_worker_attr='_tool',
        required_package='cosyvoice',
        # cosyvoice is NOT pip-installable — needs a `git clone` of
        # FunAudioLLM/CosyVoice plus model weight download via
        # huggingface_hub. Empty plan + install_target='git_clone'
        # signals Nunba to skip the pip path entirely and route
        # through its git-clone install handler instead. The
        # verify-synth probe must also short-circuit on git_clone
        # engines when the package isn't importable (current Nunba
        # bug: probe runs `import cosyvoice` blindly + always fails).
        pip_install_plan=(),
        install_target='git_clone',
    ),
    'f5_tts': TTSEngineSpec(
        engine_id='f5_tts',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_f5',
        languages=('en', 'zh'),
        quality=0.91,
        voice_clone=True,
        latency_gpu_ms=200,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.f5_tts_tool',
        tool_function='f5_synthesize',
        tool_worker_attr='_tool',
        required_package='f5_tts',
        pip_install_plan=('torchaudio', 'f5-tts'),
    ),
    'indic_parler': TTSEngineSpec(
        engine_id='indic_parler',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_indic_parler',
        languages=(
            'hi', 'ta', 'te', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ur',
            'as', 'bho', 'doi', 'kok', 'mai', 'mni', 'ne', 'sa', 'sat', 'sd', 'en',
        ),
        quality=0.90,
        voice_clone=False,
        latency_gpu_ms=300,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.indic_parler_tool',
        tool_function='indic_parler_synthesize',
        tool_worker_attr='_tool',
        required_package='parler_tts',
        # Indic Parler quarantines into its own venv on the desktop —
        # parler-tts 0.2.2 hard-pins transformers<4.47 which conflicts
        # with the main interpreter's transformers 5.1.0. The full
        # pip plan lives here so it travels with the engine spec; the
        # desktop installer routes the install into the venv when
        # install_target='venv'. Worker file:
        # tts/indic_parler_worker.py (Nunba). HARTOS server side runs
        # Indic Parler in its own subprocess worker so the main
        # interpreter pin doesn't apply there either.
        pip_install_plan=(
            # tqdm + colorama pinned FIRST to stop pip's resolver from
            # backtracking through colorama 0.1.x (no setup.py, breaks
            # install). Witnessed user-facing failure:
            # "Indic Parler TTS unavailable — using fallback voice engine"
            # Root-caused from ~/Documents/Nunba/logs/venv_indic_parler.log.
            'colorama>=0.4.6',
            'tqdm>=4.65',
            'transformers==4.46.1',  # parler-tts 0.2.2 requires <4.47
            'torch',  # CPU-ish fallback; replaced by CUDA if GPU
            'torchaudio',
            'sentencepiece',
            'descript-audio-codec',
            'parler-tts==0.2.2',  # 0.2.3 has DacModel.decode() API mismatch
            'soundfile',
            _HF_HUB_PIN,
        ),
        install_target='venv',
    ),
    'chatterbox_ml': TTSEngineSpec(
        engine_id='chatterbox_ml',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_chatterbox_ml',
        languages=(
            'en', 'zh', 'ja', 'ko', 'de', 'es', 'fr', 'it', 'ru', 'pt',
            'ar', 'nl', 'pl', 'sv', 'tr', 'hi', 'ta', 'te', 'bn', 'id',
            'th', 'vi', 'cs',
        ),
        quality=0.94,
        voice_clone=True,
        latency_gpu_ms=300,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.chatterbox_tool',
        tool_function='chatterbox_ml_synthesize',
        tool_worker_attr='_ml',
        required_package='chatterbox',
        pip_install_plan=_CHATTERBOX_PIP_PLAN,
    ),
    'pocket_tts': TTSEngineSpec(
        engine_id='pocket_tts',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('en',),
        quality=0.85,
        voice_clone=True,
        latency_gpu_ms=0,
        latency_cpu_ms=200,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.pocket_tts_tool',
        tool_function='pocket_tts_synthesize',
        pip_install_plan=('pocket-tts',),
    ),
    # NeuTTS Air — Neuphonic 748M-param Qwen2 backbone with NeuCodec
    # decoder, Apache 2.0. GGUF Q4 (~600MB) / Q8 (~800MB). RTF<0.5
    # on CPU (Intel i5 / RPi 5), 24kHz output, instant voice cloning
    # from 3-15s reference audio. English primary. Slots between
    # omnivoice and kokoro on the English ladder per quality
    # (kokoro=0.88, neutts=0.91, omnivoice~0.93, chatterbox=0.95).
    #
    # Reference voice contract: NeuTTS requires a reference audio +
    # transcript per call (no built-in 'alba'-style zero-config
    # voices). The wrapper resolves 'jo' (upstream sample shipped
    # with the package), any path to a .wav with companion .txt,
    # or a custom name from ~/.hevolve/models/tts/neutts/voices/.
    # See integrations/service_tools/neutts_tool.py for resolution.
    'neutts_air': TTSEngineSpec(
        engine_id='neutts_air',
        device=TTSDevice.GPU_PREFERRED,
        vram_key='tts_neutts',
        languages=('en',),
        quality=0.91,
        voice_clone=True,
        latency_gpu_ms=150,
        latency_cpu_ms=400,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.neutts_tool',
        tool_function='neutts_synthesize',
        # Worker attribute — Nunba's `_SubprocessTTSBackend` needs this
        # to drive the subprocess. Without it the spec falls into the
        # `_InProcessTTSBackend` path (line ~2408 of Nunba's
        # tts_engine.py) which does `import neutts` from the MAIN
        # interpreter — a guaranteed ImportError because
        # install_target='venv' lands the package in the per-engine
        # venv, not the main python-embed. Setting `_tool` here pairs
        # with the ToolWorker singleton in
        # integrations.service_tools.neutts_tool, mirroring
        # kokoro / chatterbox / f5 / indic_parler.
        tool_worker_attr='_tool',
        required_package='neutts',
        # `neutts[all]` pulls llama-cpp-python (for GGUF inference)
        # plus soundfile + onnxruntime. The base `neutts` package
        # alone is not enough for synth — the codec decoder needs
        # onnxruntime. Pin huggingface_hub via _HF_HUB_PIN so the
        # transformers chain stays consistent with the rest of the
        # English ladder (chatterbox / kokoro use the same pin).
        pip_install_plan=(
            _HF_HUB_PIN,
            'neutts[all]',
            'soundfile',  # explicit — wrapper requires soundfile.write
        ),
        # Quarantine into its own venv on the desktop installer.
        # NeuTTS pulls llama-cpp-python which can drift from the
        # main interpreter's torch / numpy stack. Same pattern as
        # chatterbox_turbo and indic_parler.
        install_target='venv',
    ),
    # Kokoro 82M — tiny neural English TTS. Runs on CPU (≈1× real-time,
    # 200MB RAM) or GPU (≈0.1× real-time, 200MB VRAM). Quality sits
    # above Piper and below the big voice-clone engines, so it's the
    # right second rung on the English ladder — tried when the GPU
    # engines can't run (no CUDA, VRAM full, package missing) but
    # BEFORE we fall all the way down to Piper.
    #
    # Benchmark context (vs piper, on English):
    #   - quality:     kokoro 0.88  vs piper 0.70  (subjective MOS gap)
    #   - cpu latency: kokoro 400ms vs piper 200ms (per ~10 words)
    #   - disk:        kokoro 160MB vs piper 60MB  (per voice)
    #   - voices:      kokoro ~25   vs piper ~15   (per-language catalog)
    # Piper still wins on raw CPU speed and disk, which is why it
    # stays the absolute last-resort fallback.
    'kokoro': TTSEngineSpec(
        engine_id='kokoro',
        device=TTSDevice.GPU_PREFERRED,
        vram_key='tts_kokoro',
        languages=('en',),
        quality=0.88,
        voice_clone=False,
        latency_gpu_ms=120,
        latency_cpu_ms=400,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.kokoro_tool',
        tool_function='kokoro_synthesize',
        tool_worker_attr='_tool',
        required_package='kokoro',
        pip_install_plan=(
            _HF_HUB_PIN,
            'kokoro',    # pulls misaki phonemizer transitively
            'espeakng',  # espeak-ng Python bindings (ships binary on Windows)
        ),
    ),
    # OmniVoice — universal TTS. Qwen3-0.6B backbone + diffusion head,
    # 646 languages (581k training hours spanning every Indic script,
    # zh/ja/ko, European, Arabic, low-resource). Zero-shot voice cloning
    # from 3-10 s of reference audio. Apache 2.0.
    #
    # Languages tuple is ('*',) — same wildcard convention as espeak —
    # but select_engines() only considers engines explicitly listed in
    # LANG_ENGINE_PREFERENCE for the resolved language. We prepend
    # 'omnivoice' to every Indic + non-English entry + _DEFAULT_PREFERENCE
    # so it wins unless it's uninstalled or the GPU can't hold it.
    #
    # VRAM is stubbed at 3.0 GB in vram_manager.VRAM_BUDGETS; the worker
    # self-reports actual usage on first load via '__WORKER_VRAM_GB__'
    # and vram_manager.record_actual_usage tightens the budget.
    'omnivoice': TTSEngineSpec(
        engine_id='omnivoice',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_omnivoice',
        languages=('*',),  # 646 languages
        quality=0.93,
        voice_clone=True,
        latency_gpu_ms=250,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.omnivoice_tool',
        tool_function='omnivoice_synthesize',
        tool_worker_attr='_tool',
        required_package='omnivoice',
        # See omnivoice_tool.py docstring: "Requires: pip install
        # omnivoice torch soundfile". torch is bundled.
        pip_install_plan=('omnivoice', 'soundfile'),
    ),
    'espeak': TTSEngineSpec(
        engine_id='espeak',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('*',),  # 100+ languages
        quality=0.40,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=10,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.pocket_tts_tool',
        tool_function='pocket_tts_synthesize',  # espeak is fallback inside pocket
        install_target='bundled',
    ),
    'makeittalk': TTSEngineSpec(
        engine_id='makeittalk',
        device=TTSDevice.CLOUD,
        vram_key='',
        languages=('en',),
        quality=0.88,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=0,
        latency_cloud_ms=5000,
        tool_module=None,  # Special cloud path in model_bus_service
        tool_function=None,
        install_target='cloud',
    ),
    # Piper — bundled CPU engine, multilingual via downloadable voice
    # files. Uses ('*',) wildcard (same convention as espeak) so one
    # spec covers every language Piper has voices for — no parallel
    # per-language list. Runtime synth attempt raises on missing voice
    # files and the router falls through to a neural engine.
    'piper': TTSEngineSpec(
        engine_id='piper',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('*',),
        quality=0.70,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=200,
        latency_cloud_ms=0,
        tool_module=None,  # In-process via Nunba tts/piper_tts.py —
                           # no subprocess worker, no required_package.
        tool_function=None,
        install_target='bundled',
    ),
    # ── Mid-VRAM coverage tier (1–3 GB) ───────────────────────────
    # These three engines fill the gap so every SUPPORTED_LANG_DICT
    # code has at least one engine with vram_gb≤3.0 in its preference
    # ladder. Indic Parler (2.0) + F5 (2.5) cover en/zh + 22 Indic;
    # the trio below adds the rest of the major language families
    # without forcing users onto the 12-14 GB Chatterbox-ML or the
    # uninstallable git-clone CosyVoice path.
    'melotts': TTSEngineSpec(
        engine_id='melotts',
        device=TTSDevice.GPU_PREFERRED,  # works on CPU at real-time too
        vram_key='tts_melotts',
        languages=('en', 'es', 'fr', 'zh', 'ja', 'ko'),
        quality=0.86,
        voice_clone=False,
        latency_gpu_ms=180,
        latency_cpu_ms=600,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.melotts_tool',
        tool_function='melotts_synthesize',
        tool_worker_attr='_tool',
        required_package='melo',  # `from melo.api import TTS`
        pip_install_plan=(
            _HF_HUB_PIN,
            'melotts',    # PyPI package; ships `melo` import root
            'soundfile',  # used for duration probe
        ),
    ),
    'xtts_v2': TTSEngineSpec(
        engine_id='xtts_v2',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_xtts_v2',
        languages=(
            'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
            'cs', 'ar', 'zh', 'hu', 'ko', 'ja', 'hi',
        ),
        quality=0.92,
        voice_clone=True,
        latency_gpu_ms=350,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.xtts_tool',
        tool_function='xtts_synthesize',
        tool_worker_attr='_tool',
        required_package='TTS',  # `from TTS.api import TTS`
        pip_install_plan=(
            _HF_HUB_PIN,
            'coqui-tts',  # idiap-maintained 2026 fork on PyPI;
                          # ships `from TTS.api import TTS` so
                          # the import path is stable.
            'soundfile',
        ),
    ),
    'mms_tts': TTSEngineSpec(
        engine_id='mms_tts',
        device=TTSDevice.GPU_PREFERRED,  # CPU works, GPU faster
        vram_key='tts_mms_tts',
        languages=(
            # Roman-script languages where mms_tts_tool routes without
            # uroman. Non-Roman scripts (ar/hi/zh/ko/ja/...) ALSO have
            # mms-tts checkpoints but require uroman pre-processing —
            # the tool gracefully fails when uroman isn't installed and
            # the router falls through to the next preference. We list
            # the broader set here because the tool decides per-call
            # whether it can serve; the router's job is to attempt.
            'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
            'cs', 'hu', 'sv', 'fi', 'el', 'ro', 'bg', 'uk', 'cy', 'is',
            'zh', 'ja', 'ko', 'vi', 'th', 'id', 'ms', 'km', 'lo', 'my',
            'hi', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or',
            'ne', 'as', 'sd', 'sa', 'ur', 'si',
            'ar', 'fa', 'he', 'sw',
        ),
        quality=0.78,
        voice_clone=False,
        latency_gpu_ms=200,
        latency_cpu_ms=500,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.mms_tts_tool',
        tool_function='mms_tts_synthesize',
        tool_worker_attr='_tool',
        required_package='transformers',  # already bundled — no install plan
        pip_install_plan=(
            _HF_HUB_PIN,
            'soundfile',  # for WAV write
            # uroman is OPTIONAL — only needed for non-Roman scripts.
            # The tool falls through cleanly when missing, so we don't
            # bundle the perl repo + extra pip dep into every install.
            # Users who want broad Indic/Arabic/CJK coverage from MMS
            # specifically can `pip install uroman` separately.
        ),
    ),
}
# ═══════════════════════════════════════════════════════════════
# Language → Engine Preference Table
# ═══════════════════════════════════════════════════════════════

# Ordered by quality for each language — first available wins.
#
# Many languages share the exact same engine order. The tuples below
# name each shared ladder once; every dict entry takes an independent
# list(...) copy so mutating one language's ladder can never leak into
# another's. Rationale for the ordering:
#   * omnivoice leads every non-English entry — widest training
#     coverage, voice cloning, and cross-language engine consistency.
#   * indic_parler stays as fallback for Indic for one release cycle.
#   * xtts_v2 (2.5 GB) and melotts (1.5 GB) slot above the 12-14 GB
#     chatterbox_ml so 4-8 GB GPUs get quality TTS.
#   * mms_tts (1 GB tier) is the always-runnable neural floor.
#   * espeak is the absolute last-resort phoneme synth everywhere.
_LADDER_INDIC = ('omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak')
_LADDER_INDIC_LITE = ('omnivoice', 'indic_parler', 'mms_tts', 'espeak')
_LADDER_EURO_XTTS = ('omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak')
_LADDER_HEAVY = ('omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak')
_LADDER_MMS = ('omnivoice', 'mms_tts', 'espeak')
_LADDER_ZH = ('omnivoice', 'melotts', 'cosyvoice3', 'f5_tts', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak')

LANG_ENGINE_PREFERENCE: Dict[str, List[str]] = {
    # English ladder (quality-then-speed): chatterbox_turbo wins on
    # quality (0.95); omnivoice (~0.93) next for cross-language engine
    # consistency; neutts_air (0.91) sits above kokoro (0.88);
    # pocket_tts is the small cloneable fallback; piper/espeak are the
    # bundled CPU floors that always ship.
    'en': ['chatterbox_turbo', 'omnivoice', 'neutts_air', 'melotts', 'xtts_v2', 'kokoro', 'pocket_tts', 'cosyvoice3', 'mms_tts', 'piper', 'espeak'],
    # Indic — omnivoice replaces indic_parler as primary (far more
    # training hours per major Indic language, plus voice cloning which
    # parler lacks). xtts_v2 adds Hindi only.
    'hi': ['omnivoice', 'indic_parler', 'xtts_v2', 'chatterbox_ml', 'cosyvoice3', 'mms_tts', 'espeak'],
    'ta': list(_LADDER_INDIC),
    'te': list(_LADDER_INDIC),
    'bn': list(_LADDER_INDIC),
    'gu': list(_LADDER_INDIC),
    'kn': list(_LADDER_INDIC),
    'ml': list(_LADDER_INDIC),
    'mr': list(_LADDER_INDIC),
    'or': list(_LADDER_INDIC),
    'pa': list(_LADDER_INDIC),
    'ur': list(_LADDER_INDIC),
    'as': list(_LADDER_INDIC),
    'ne': list(_LADDER_INDIC),
    'sa': list(_LADDER_INDIC),
    'si': list(_LADDER_HEAVY),   # Sinhala — no indic_parler voice; mms adds 1 GB-tier
    'sd': list(_LADDER_INDIC),   # Sindhi — Indic Parler + mms
    # CJK — omnivoice has 500k+ hours of CJK training; melotts slots
    # above the heavy chatterbox_ml for the 1.5 GB tier.
    'zh': list(_LADDER_ZH),
    'ja': ['omnivoice', 'melotts', 'cosyvoice3', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ko': ['omnivoice', 'melotts', 'cosyvoice3', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    # European — xtts_v2 (voice clone) and melotts sit above the 12 GB
    # chatterbox_ml so it never pushes other workers off a small GPU.
    'de': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'es': ['omnivoice', 'melotts', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'fr': ['omnivoice', 'melotts', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'it': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ru': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'pt': list(_LADDER_EURO_XTTS),
    'ar': list(_LADDER_EURO_XTTS),
    'nl': list(_LADDER_EURO_XTTS),
    'pl': list(_LADDER_EURO_XTTS),
    'sv': list(_LADDER_HEAVY),
    'tr': list(_LADDER_EURO_XTTS),
    'id': list(_LADDER_HEAVY),
    'th': list(_LADDER_HEAVY),
    'vi': list(_LADDER_HEAVY),
    'cs': list(_LADDER_EURO_XTTS),
    # Previously routed via _DEFAULT_PREFERENCE only (where
    # chatterbox_ml needs 14 GB); mms_tts at 1 GB is the runnable floor.
    'hu': list(_LADDER_EURO_XTTS),
    'el': ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak'],
    'fi': list(_LADDER_MMS),
    'ro': list(_LADDER_MMS),
    'bg': list(_LADDER_MMS),
    'uk': list(_LADDER_MMS),
    'cy': list(_LADDER_MMS),     # Welsh
    'is': list(_LADDER_MMS),     # Icelandic
    'ms': list(_LADDER_MMS),     # Malay
    'fa': list(_LADDER_MMS),     # Persian (uroman)
    'he': list(_LADDER_MMS),     # Hebrew (uroman)
    'sw': list(_LADDER_MMS),     # Swahili
    'km': list(_LADDER_MMS),     # Khmer (uroman)
    'lo': list(_LADDER_MMS),     # Lao (uroman)
    'my': list(_LADDER_MMS),     # Burmese (uroman)
    # Additional Indic codes from SUPPORTED_LANG_DICT riding Indic
    # Parler's 22-language coverage, then mms_tts.
    'brx': list(_LADDER_INDIC_LITE),  # Bodo
    'doi': list(_LADDER_INDIC_LITE),  # Dogri
    'kok': list(_LADDER_INDIC_LITE),  # Konkani
    'mai': list(_LADDER_INDIC_LITE),  # Maithili
    'mni': list(_LADDER_INDIC_LITE),  # Manipuri
    'sat': list(_LADDER_INDIC_LITE),  # Santali
    'ks': list(_LADDER_MMS),          # Kashmiri
    'lv': list(_LADDER_MMS),          # Latvian
    'sr': ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak'],  # Serbian
    'zh-cn': list(_LADDER_ZH),
}
# Fallback for unlisted languages — omnivoice covers 646 + mms_tts covers
# 1100+, so this is reached only when both are uninstalled / can't fit.
# chatterbox_ml is the heaviest local clone, espeak is the absolute floor.
# Used by TTSRouter.select_engines() when neither the full language code
# nor its 2-char base appears in LANG_ENGINE_PREFERENCE.
_DEFAULT_PREFERENCE: List[str] = ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak']
# ═══════════════════════════════════════════════════════════════
# Route result
# ═══════════════════════════════════════════════════════════════

class TTSLocation(Enum):
    """Where a synthesis request will execute.

    LOCAL     — this host (CPU or GPU)
    HIVE_PEER — offloaded to a GPU peer on the compute mesh
    CLOUD     — remote HTTP API (MakeItTalk)
    """
    LOCAL = "local"
    HIVE_PEER = "hive_peer"
    CLOUD = "cloud"
@dataclass
class TTSCandidate:
    """A scored TTS engine candidate produced by TTSRouter.select_engines()."""
    engine: TTSEngineSpec               # static capabilities of the engine
    location: TTSLocation               # where synthesis would run
    device: str  # 'gpu', 'cpu', 'cloud'
    estimated_latency_ms: int           # engine latency (+ network hop for hive peers)
    quality_score: float                # spec.quality, possibly penalized (network / CPU)
    peer_address: Optional[str] = None  # if location == HIVE_PEER
    warnings: List[str] = field(default_factory=list)  # propagated into TTSResult.warnings
@dataclass
class TTSResult:
    """Result of a TTS synthesis."""
    path: str
    duration: float
    engine_id: str
    device: str
    location: str
    latency_ms: float
    sample_rate: int
    voice: str
    quality_score: float
    warnings: List[str] = field(default_factory=list)
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a wire-friendly dict.

        'warnings' and 'error' are included only when set, so successful
        results stay compact.
        """
        payload: Dict[str, Any] = dict((
            ('path', self.path),
            ('duration', self.duration),
            ('engine', self.engine_id),
            ('device', self.device),
            ('location', self.location),
            ('latency_ms', self.latency_ms),
            ('sample_rate', self.sample_rate),
            ('voice', self.voice),
            ('quality_score', self.quality_score),
        ))
        for key, value in (('warnings', self.warnings), ('error', self.error)):
            if value:
                payload[key] = value
        return payload
786# ═══════════════════════════════════════════════════════════════
787# Language Detection
788# ═══════════════════════════════════════════════════════════════
def detect_language(text: str) -> str:
    """Detect language of text. Returns ISO 639-1 code (e.g. 'en', 'hi').

    Uses langdetect if available, falls back to a Unicode-script counting
    heuristic. Empty / whitespace-only input defaults to 'en'.
    """
    if not text or not text.strip():
        return 'en'
    try:
        from langdetect import detect
        return detect(text)
    except Exception:
        # langdetect missing, or it failed on this input — fall through
        # to the script-range heuristic below.
        pass

    # Heuristic fallback: count characters per Unicode script block and
    # return the first script (in priority order) whose count exceeds the
    # threshold. Order matters — Indic scripts are checked before CJK,
    # and Japanese combines katakana + hiragana into one count.
    sample = text[:500]
    script_ranges = [
        ('hi', (('\u0900', '\u097F'),)),                      # Devanagari
        ('ta', (('\u0B80', '\u0BFF'),)),                      # Tamil
        ('te', (('\u0C00', '\u0C7F'),)),                      # Telugu
        ('bn', (('\u0980', '\u09FF'),)),                      # Bengali
        ('gu', (('\u0A80', '\u0AFF'),)),                      # Gujarati
        ('kn', (('\u0C80', '\u0CFF'),)),                      # Kannada
        ('ml', (('\u0D00', '\u0D7F'),)),                      # Malayalam
        ('zh', (('\u4E00', '\u9FFF'),)),                      # CJK unified ideographs
        ('ko', (('\uAC00', '\uD7AF'),)),                      # Hangul syllables
        ('ja', (('\u30A0', '\u30FF'), ('\u3040', '\u309F'))), # Katakana + Hiragana
        ('ar', (('\u0600', '\u06FF'),)),                      # Arabic
        ('ru', (('\u0400', '\u04FF'),)),                      # Cyrillic
    ]
    # At least 3 chars, or 10% of the sample, must belong to the script.
    threshold = max(3, len(sample) * 0.1)
    for lang, ranges in script_ranges:
        count = sum(1 for c in sample for lo, hi in ranges if lo <= c <= hi)
        if count > threshold:
            return lang
    return 'en'
849# ═══════════════════════════════════════════════════════════════
850# Engine Availability Detection
851# ═══════════════════════════════════════════════════════════════
# Cache for engine availability (avoid repeated import checks).
# Maps engine_id -> (available, checked_at_unix_seconds); an entry is
# considered fresh for _CACHE_TTL seconds after checked_at.
_engine_available_cache: Dict[str, Tuple[bool, float]] = {}
_CACHE_TTL = 60.0  # seconds
# Engines whose availability is proven by importing one service-tool symbol:
# engine_id -> (module, symbol). Engines needing a bespoke probe (espeak
# binary on PATH, find_spec for renamed import roots, env-var for cloud)
# are handled explicitly in _is_engine_installed.
_TOOL_IMPORT_CHECKS: Dict[str, Tuple[str, str]] = {
    'pocket_tts': ('integrations.service_tools.pocket_tts_tool', 'pocket_tts_synthesize'),
    'luxtts': ('integrations.service_tools.luxtts_tool', 'luxtts_synthesize'),
    'cosyvoice3': ('integrations.service_tools.cosyvoice_tool', 'cosyvoice_synthesize'),
    'indic_parler': ('integrations.service_tools.indic_parler_tool', 'indic_parler_synthesize'),
    'chatterbox_turbo': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
    'chatterbox_ml': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
    'f5_tts': ('integrations.service_tools.f5_tts_tool', 'f5_synthesize'),
    # transformers is bundled; check the VitsModel symbol so we detect
    # outright-broken transformers installs early.
    'mms_tts': ('transformers', 'VitsModel'),
    'kokoro': ('integrations.service_tools.kokoro_tool', 'kokoro_synthesize'),
}


def _is_engine_installed(engine_id: str) -> bool:
    """Check if a TTS engine's Python package is available (cached 60 s).

    TODO REFACTOR: move to model_catalog as ModelEntry.is_installed() —
    a model that isn't pip-importable shouldn't be selectable by any caller.
    """
    now = time.time()
    cached = _engine_available_cache.get(engine_id)
    if cached and (now - cached[1]) < _CACHE_TTL:
        return cached[0]

    # Unknown engine or no tool module registered — never available.
    spec = ENGINE_REGISTRY.get(engine_id)
    if not spec or not spec.tool_module:
        _engine_available_cache[engine_id] = (False, now)
        return False

    available = False
    try:
        if engine_id == 'espeak':
            # espeak availability checked via the binary on PATH
            import shutil
            available = shutil.which('espeak-ng') is not None or shutil.which('espeak') is not None
        elif engine_id == 'melotts':
            # `melotts` PyPI package ships the `melo` import root.
            import importlib.util as _ils
            available = _ils.find_spec('melo') is not None
        elif engine_id == 'xtts_v2':
            # `coqui-tts` PyPI package ships `from TTS.api import TTS`.
            import importlib.util as _ils
            available = _ils.find_spec('TTS') is not None
        elif engine_id == 'makeittalk':
            # Cloud engine — "installed" means the API endpoint is configured.
            import os
            available = bool(os.environ.get('MAKEITTALK_API_URL'))
        elif engine_id in _TOOL_IMPORT_CHECKS:
            import importlib
            module_name, symbol = _TOOL_IMPORT_CHECKS[engine_id]
            available = hasattr(importlib.import_module(module_name), symbol)
    except Exception:
        # Any import-time failure (missing package, broken install) means
        # the engine can't serve requests right now.
        available = False

    _engine_available_cache[engine_id] = (available, now)
    return available
924def _get_gpu_info() -> Dict[str, Any]:
925 """Get GPU info from VRAMManager (cached singleton)."""
926 try:
927 from integrations.service_tools.vram_manager import get_vram_manager
928 mgr = get_vram_manager()
929 return mgr.detect_gpu()
930 except (ImportError, Exception):
931 return {'cuda_available': False, 'total_gb': 0, 'free_gb': 0}
def _can_fit_on_gpu(engine_id: str) -> bool:  # TODO REFACTOR: remove — duplicates catalog.matches_compute()
    """Check if this engine's model fits in available VRAM.

    Returns False for unknown engines, engines without a vram_key, or
    whenever the VRAM manager is unavailable or raises.
    """
    spec = ENGINE_REGISTRY.get(engine_id)
    if not spec or not spec.vram_key:
        return False
    try:
        from integrations.service_tools.vram_manager import get_vram_manager
        return get_vram_manager().can_fit(spec.vram_key)
    except Exception:
        return False
946def _get_compute_policy() -> Dict[str, Any]:
947 """Get user's compute policy (local_only / local_preferred / any)."""
948 try:
949 from integrations.agent_engine.compute_config import get_compute_policy
950 return get_compute_policy()
951 except (ImportError, Exception):
952 return {'compute_policy': 'local_preferred'}
955# ═══════════════════════════════════════════════════════════════
956# Hive Peer TTS Offload
957# ═══════════════════════════════════════════════════════════════
959def _find_hive_peer_for_tts(language: str) -> Optional[Dict[str, Any]]:
960 # TODO REFACTOR: move to orchestrator as find_peer_for(model_type, language) —
961 # hive peer offloading applies to all model types (STT, VLM, LLM), not just TTS.
962 """Find a hive peer with GPU that can serve TTS for this language.
964 Returns peer info dict or None.
965 """
966 try:
967 from integrations.agent_engine.compute_mesh_service import get_compute_mesh
968 mesh = get_compute_mesh()
969 if not mesh or not mesh.peers:
970 return None
972 for peer in mesh.peers.values():
973 if not peer.available_compute or peer.available_compute < 0.1:
974 continue
975 # Peer has GPU and capacity
976 caps = peer.capabilities or {}
977 if caps.get('gpu'):
978 return {
979 'peer_id': peer.peer_id,
980 'address': peer.address,
981 'latency_ms': peer.latency_ms or 500,
982 'gpu': caps.get('gpu', 'unknown'),
983 }
984 return None
985 except (ImportError, Exception):
986 return None
def _offload_tts_to_peer(peer: Dict, text: str, language: str,
                         voice: Optional[str] = None) -> Optional[Dict]:
    """Offload TTS synthesis to a hive peer via compute mesh (DRY — reuses mesh service).

    NOTE(review): `peer` is currently unused — offload_to_best_peer picks
    its own target. The parameter is kept for interface stability; confirm
    whether it should be wired through to the mesh call.

    Returns the peer's result dict on success, None on any failure.
    """
    try:
        from integrations.agent_engine.compute_mesh_service import get_compute_mesh
        mesh = get_compute_mesh()
        if not mesh:
            return None
        result = mesh.offload_to_best_peer(
            model_type='tts',
            prompt=text,
            options={'language': language, 'voice': voice or 'default'},
        )
        if result and 'error' not in result:
            return result
    except Exception as e:
        logger.debug("Hive TTS offload failed: %s", e)
    return None
1009# ═══════════════════════════════════════════════════════════════
1010# TTSRouter — the brain
1011# ═══════════════════════════════════════════════════════════════
class TTSRouter:
    """Smart TTS engine selector and dispatcher.

    Considers language, hardware, compute policy, latency, and hive peers
    to select the best engine for each synthesis request.
    """

    # Neural service-tool engines that share the uniform
    # fn(text, language=..., voice=..., output_path=...) call shape:
    # engine_id -> (module_path, function_name). Dispatched through
    # _call_gpu_engine; engines with bespoke call shapes (luxtts,
    # pocket_tts, espeak, cloud) keep dedicated _call_* methods.
    _GPU_ENGINE_TOOLS: Dict[str, Tuple[str, str]] = {
        'cosyvoice3': ('integrations.service_tools.cosyvoice_tool', 'cosyvoice_synthesize'),
        'indic_parler': ('integrations.service_tools.indic_parler_tool', 'indic_parler_synthesize'),
        'chatterbox_turbo': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
        'chatterbox_ml': ('integrations.service_tools.chatterbox_tool', 'chatterbox_ml_synthesize'),
        'f5_tts': ('integrations.service_tools.f5_tts_tool', 'f5_synthesize'),
        'kokoro': ('integrations.service_tools.kokoro_tool', 'kokoro_synthesize'),
        'melotts': ('integrations.service_tools.melotts_tool', 'melotts_synthesize'),
        'xtts_v2': ('integrations.service_tools.xtts_tool', 'xtts_synthesize'),
        'mms_tts': ('integrations.service_tools.mms_tts_tool', 'mms_tts_synthesize'),
    }

    def select_engines(  # TODO REFACTOR: remove — catalog.select_best() is the single selector.
        # Language preferences feed into catalog via populate_tts_catalog()'s language_priority.
        # Move _is_engine_installed() to catalog, _find_hive_peer to orchestrator.
        self,
        text: str,
        language: Optional[str] = None,
        voice: Optional[str] = None,
        urgency: str = 'normal',
        require_clone: bool = False,
    ) -> List[TTSCandidate]:
        """Select and rank TTS engines for the given request.

        Args:
            text: Text to synthesize
            language: ISO 639-1 code (auto-detected if None)
            voice: Voice reference (triggers clone-capable filter)
            urgency: 'instant' (fastest), 'normal', 'quality' (best quality)
            require_clone: Only return engines with voice cloning

        Returns:
            Ranked list of TTSCandidate (best first), never empty
        """
        # Step 1: Detect language. Lowercase but do NOT truncate yet —
        # the old `lang[:2]` normalization made every 3+-char key in
        # LANG_ENGINE_PREFERENCE ('zh-cn', 'brx', 'doi', 'kok', 'mai',
        # 'mni', 'sat', ...) unreachable.
        lang = (language or detect_language(text)).lower()

        # Step 2: Get preferred engines for this language — exact code
        # first, then the 2-char base code ('pt-br' -> 'pt'), then the
        # universal fallback chain.
        preferred = LANG_ENGINE_PREFERENCE.get(lang)
        if preferred is None:
            base = lang.split('-')[0][:2]
            preferred = LANG_ENGINE_PREFERENCE.get(base, _DEFAULT_PREFERENCE)

        # Step 3: Gather hardware + policy constraints
        gpu_info = _get_gpu_info()
        has_gpu = gpu_info.get('cuda_available', False)
        policy = _get_compute_policy()
        compute_mode = policy.get('compute_policy', 'local_preferred')

        # Step 4: Score each candidate (dedupe while preserving the
        # language table's preference order)
        candidates: List[TTSCandidate] = []
        seen = set()

        for engine_id in preferred:
            if engine_id in seen:
                continue
            seen.add(engine_id)

            spec = ENGINE_REGISTRY.get(engine_id)
            if not spec:
                continue

            # Voice cloning filter
            if require_clone and not spec.voice_clone:
                continue

            if spec.device == TTSDevice.CLOUD:
                # Cloud engines: skip if local_only
                if compute_mode == 'local_only':
                    continue
                if _is_engine_installed(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.CLOUD,
                        device='cloud',
                        estimated_latency_ms=spec.latency_cloud_ms,
                        quality_score=spec.quality,
                    ))
                continue

            if spec.device == TTSDevice.GPU_ONLY:
                if has_gpu and _can_fit_on_gpu(engine_id):
                    if _is_engine_installed(engine_id):
                        candidates.append(TTSCandidate(
                            engine=spec,
                            location=TTSLocation.LOCAL,
                            device='gpu',
                            estimated_latency_ms=spec.latency_gpu_ms,
                            quality_score=spec.quality,
                        ))
                    continue

                # GPU engine not runnable locally — try hive peer
                if compute_mode != 'local_only':
                    peer = _find_hive_peer_for_tts(lang)
                    if peer:
                        candidates.append(TTSCandidate(
                            engine=spec,
                            location=TTSLocation.HIVE_PEER,
                            device='gpu',
                            estimated_latency_ms=spec.latency_gpu_ms + peer['latency_ms'],
                            quality_score=spec.quality * 0.95,  # slight penalty for network
                            peer_address=peer['address'],
                            warnings=[f"Offloaded to hive peer {peer['peer_id']}"],
                        ))
                continue

            if spec.device == TTSDevice.GPU_PREFERRED:
                if not _is_engine_installed(engine_id):
                    continue
                if has_gpu and _can_fit_on_gpu(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='gpu',
                        estimated_latency_ms=spec.latency_gpu_ms,
                        quality_score=spec.quality,
                    ))
                else:
                    # CPU fallback
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='cpu',
                        estimated_latency_ms=spec.latency_cpu_ms,
                        quality_score=spec.quality * 0.9,  # CPU quality slightly lower
                        warnings=['Running on CPU (slower, install GPU for better perf)'],
                    ))
                continue

            if spec.device == TTSDevice.CPU_ONLY:
                if _is_engine_installed(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='cpu',
                        estimated_latency_ms=spec.latency_cpu_ms,
                        quality_score=spec.quality,
                    ))
                continue

        # Step 5: Always ensure espeak as ultimate fallback
        if not any(c.engine.engine_id == 'espeak' for c in candidates):
            espeak_spec = ENGINE_REGISTRY['espeak']
            candidates.append(TTSCandidate(
                engine=espeak_spec,
                location=TTSLocation.LOCAL,
                device='cpu',
                estimated_latency_ms=10,
                quality_score=espeak_spec.quality,
                warnings=['Fallback: no neural TTS available for this language'],
            ))

        # Step 6: Sort by urgency-weighted score
        if urgency == 'instant':
            # Minimize latency — instant response
            candidates.sort(key=lambda c: (c.estimated_latency_ms, -c.quality_score))
        elif urgency == 'quality':
            # Maximize quality — don't care about latency
            candidates.sort(key=lambda c: (-c.quality_score, c.estimated_latency_ms))
        else:
            # Balance: quality * 0.6 + inverse_latency * 0.4
            max_latency = max(c.estimated_latency_ms for c in candidates) or 1
            candidates.sort(key=lambda c: -(
                c.quality_score * 0.6 +
                (1 - c.estimated_latency_ms / max_latency) * 0.4
            ))

        return candidates

    def synthesize(
        self,
        text: str,
        language: Optional[str] = None,
        voice: Optional[str] = None,
        output_path: Optional[str] = None,
        source: Optional[str] = None,
        urgency: str = 'normal',
        engine_override: Optional[str] = None,
    ) -> TTSResult:
        """Synthesize text using the best available TTS engine.

        Tries engines in ranked order until one succeeds.

        Args:
            text: Text to synthesize
            language: ISO 639-1 code (auto-detected if None)
            voice: Voice reference for cloning (path or saved name)
            output_path: Where to write WAV (auto-generated if None)
            source: Context hint (e.g. 'chat_response', 'greeting') —
                auto-maps to urgency via SOURCE_URGENCY
            urgency: 'instant' | 'normal' | 'quality' (used if source not set)
            engine_override: Force a specific engine (bypasses selection)

        Returns:
            TTSResult with synthesis details
        """
        # Auto-infer urgency from source hint
        if source:
            urgency = SOURCE_URGENCY.get(source, urgency)
        if not text or not text.strip():
            return TTSResult(
                path='', duration=0, engine_id='none', device='none',
                location='none', latency_ms=0, sample_rate=0, voice='',
                quality_score=0, error='Text is required',
            )

        lang = language or detect_language(text)

        # Normalize numbers, currency, URLs, units to spoken form BEFORE
        # engine selection — every TTS engine benefits (single converging
        # path). Latency-sensitive ('instant' urgency) skips the LLM
        # fallback but keeps the fast rule pass.
        try:
            from integrations.channels.media.tts_text_normalizer import (
                normalize_for_tts,
            )
            text = normalize_for_tts(
                text, lang, use_llm=(urgency != 'instant'),
            )
        except Exception as _e:  # never let normalization block synthesis
            logger.debug('tts normalization skipped: %s', _e)

        # Cloning is required only for a real voice reference (not the
        # default / empty sentinels).
        require_clone = voice is not None and voice not in ('default', '')

        # Engine override bypasses ranking entirely
        if engine_override and engine_override in ENGINE_REGISTRY:
            spec = ENGINE_REGISTRY[engine_override]
            candidates = [TTSCandidate(
                engine=spec,
                location=TTSLocation.LOCAL,
                device='gpu' if spec.device in (TTSDevice.GPU_ONLY, TTSDevice.GPU_PREFERRED) else 'cpu',
                estimated_latency_ms=spec.latency_gpu_ms or spec.latency_cpu_ms,
                quality_score=spec.quality,
            )]
        else:
            candidates = self.select_engines(
                text, lang, voice, urgency, require_clone,
            )

        # Try each candidate in order; collect failure reasons so the
        # final result (success or not) explains what was skipped.
        all_warnings: List[str] = []
        for candidate in candidates:
            t0 = time.time()
            try:
                result = self._execute(candidate, text, lang, voice, output_path)
                elapsed = (time.time() - t0) * 1000
                if result and not result.get('error'):
                    all_warnings.extend(candidate.warnings)
                    return TTSResult(
                        path=result.get('path', ''),
                        duration=result.get('duration', 0),
                        engine_id=candidate.engine.engine_id,
                        device=candidate.device,
                        location=candidate.location.value,
                        latency_ms=round(elapsed, 1),
                        sample_rate=result.get('sample_rate', candidate.engine.sample_rate),
                        voice=result.get('voice', voice or 'default'),
                        quality_score=candidate.quality_score,
                        warnings=all_warnings,
                    )
                err = result.get('error', 'unknown') if result else 'no result'
                all_warnings.append(
                    f"{candidate.engine.engine_id} failed: {err}"
                )
            except Exception as e:
                all_warnings.append(f"{candidate.engine.engine_id} error: {e}")
                logger.debug("TTS engine %s failed: %s", candidate.engine.engine_id, e)

        # All engines failed
        return TTSResult(
            path='', duration=0, engine_id='none', device='none',
            location='none', latency_ms=0, sample_rate=0, voice='',
            quality_score=0, warnings=all_warnings,
            error='All TTS engines failed',
        )

    def _execute(
        self, candidate: TTSCandidate, text: str,
        language: str, voice: Optional[str], output_path: Optional[str],
    ) -> Optional[Dict[str, Any]]:
        """Execute TTS on a specific candidate engine.

        Returns the engine's result dict (success or {'error': ...}),
        or None when a hive-peer offload produced nothing.
        """
        # Hive peer offload
        if candidate.location == TTSLocation.HIVE_PEER:
            peer_info = {
                'address': candidate.peer_address,
                'peer_id': 'hive',
                'latency_ms': candidate.estimated_latency_ms,
            }
            return _offload_tts_to_peer(peer_info, text, language, voice)

        # Cloud (MakeItTalk)
        if candidate.location == TTSLocation.CLOUD:
            return self._execute_makeittalk(text, voice)

        # Local engines with bespoke call shapes
        engine_id = candidate.engine.engine_id
        if engine_id == 'luxtts':
            return self._call_luxtts(text, voice, output_path, candidate.device)
        if engine_id == 'pocket_tts':
            return self._call_pocket_tts(text, voice, output_path)
        if engine_id == 'espeak':
            return self._call_espeak(text, language, output_path)

        # Everything else shares the generic service-tool call shape
        tool = self._GPU_ENGINE_TOOLS.get(engine_id)
        if tool:
            module_path, function_name = tool
            return self._call_gpu_engine(
                module_path, function_name, text, language, voice, output_path,
            )
        return {'error': f'Unknown engine: {engine_id}'}

    def _call_luxtts(self, text, voice, output_path, device):
        """LuxTTS — voice-clone engine with a bespoke (voice_audio, device) call shape."""
        from integrations.service_tools.luxtts_tool import luxtts_synthesize
        result_str = luxtts_synthesize(
            text, voice_audio=voice, output_path=output_path, device=device,
        )
        return json.loads(result_str)

    def _call_pocket_tts(self, text, voice, output_path):
        """Pocket TTS — CPU engine; 'alba' is used when no real voice is given."""
        from integrations.service_tools.pocket_tts_tool import pocket_tts_synthesize
        voice_name = voice if voice and voice != 'default' else 'alba'
        result_str = pocket_tts_synthesize(text, voice_name, output_path)
        return json.loads(result_str)

    def _call_espeak(self, text, language, output_path):
        """Call espeak-ng via pocket_tts_tool (DRY — reuses existing impl)."""
        import os

        if not output_path:
            out_dir = os.environ.get('TTS_TEMP_DIR', '/tmp/tts')
            os.makedirs(out_dir, exist_ok=True)
            output_path = os.path.join(out_dir, f'espeak_{int(time.time()*1000)}.wav')

        try:
            from integrations.service_tools.pocket_tts_tool import _espeak_synthesize
            espeak_lang = language if language else 'en'
            if _espeak_synthesize(text[:5000], output_path, voice=espeak_lang):
                return {
                    'path': output_path,
                    'duration': len(text.split()) / 150 * 60,  # ~150 wpm estimate
                    'sample_rate': 22050,
                    'voice': espeak_lang,
                    'engine': 'espeak-ng',
                }
            return {'error': 'espeak-ng not installed'}
        except Exception:
            return {'error': 'espeak-ng not available'}

    def _call_gpu_engine(self, module_path, function_name, text, language,
                         voice, output_path):
        """Generic caller for GPU TTS service tools.

        Imports `module_path`, calls `function_name(text, language=,
        voice=, output_path=)` and decodes its JSON string result.
        """
        import importlib
        try:
            mod = importlib.import_module(module_path)
            fn = getattr(mod, function_name)
            result_str = fn(text, language=language, voice=voice,
                            output_path=output_path)
            return json.loads(result_str)
        except ImportError as e:
            return {'error': f'{module_path} not installed: {e}'}
        except Exception as e:
            return {'error': str(e)}

    def _execute_makeittalk(self, text, voice):
        """Cloud TTS via MakeItTalk API (requires MAKEITTALK_API_URL)."""
        import os
        base_url = os.environ.get('MAKEITTALK_API_URL')
        if not base_url:
            return {'error': 'MAKEITTALK_API_URL not set'}
        try:
            import requests
            resp = requests.post(
                f"{base_url}/video-gen/",
                json={
                    'text': text,
                    'voiceName': voice or 'af_bella',
                    'audio_only': True,
                },
                timeout=30,
            )
            if resp.status_code == 200:
                data = resp.json()
                audio_url = data.get('audio_url') or data.get('url', '')
                return {
                    'path': audio_url,
                    'duration': data.get('duration', 0),
                    'voice': voice or 'af_bella',
                    'engine': 'makeittalk',
                    'sample_rate': 24000,
                }
            return {'error': f'MakeItTalk HTTP {resp.status_code}'}
        except Exception as e:
            return {'error': f'MakeItTalk: {e}'}

    def get_engine_status(self) -> List[Dict[str, Any]]:
        """Report status of all TTS engines for diagnostics."""
        gpu_info = _get_gpu_info()
        has_gpu = gpu_info.get('cuda_available', False)
        statuses = []

        for eid, spec in ENGINE_REGISTRY.items():
            installed = _is_engine_installed(eid)
            can_run = False
            device = 'n/a'

            if spec.device == TTSDevice.CPU_ONLY:
                can_run = installed
                device = 'cpu'
            elif spec.device == TTSDevice.GPU_ONLY:
                can_run = installed and has_gpu and _can_fit_on_gpu(eid)
                device = 'gpu' if can_run else 'n/a'
            elif spec.device == TTSDevice.GPU_PREFERRED:
                can_run = installed
                device = 'gpu' if (has_gpu and _can_fit_on_gpu(eid)) else 'cpu'
            elif spec.device == TTSDevice.CLOUD:
                can_run = installed
                device = 'cloud'

            statuses.append({
                'engine': eid,
                'installed': installed,
                'can_run': can_run,
                'device': device,
                'languages': list(spec.languages),
                'quality': spec.quality,
                'voice_clone': spec.voice_clone,
                'vram_gb': spec.vram_key,
            })

        return statuses

    def get_all_voices(self) -> List[Dict[str, Any]]:
        """Aggregate available voices from all installed TTS engines."""
        voices: List[Dict[str, Any]] = []
        try:
            from integrations.service_tools.pocket_tts_tool import (
                _BUILTIN_VOICES,
            )
            for v in _BUILTIN_VOICES:
                voices.append({'id': v, 'engine': 'pocket_tts', 'type': 'builtin'})
        except Exception:
            pass
        try:
            from integrations.service_tools.luxtts_tool import luxtts_list_voices
            result = json.loads(luxtts_list_voices())
            for v in result.get('voices', []):
                voices.append({'id': v.get('id', ''), 'engine': 'luxtts', 'type': 'cloned'})
        except Exception:
            pass
        return voices
1515# ═══════════════════════════════════════════════════════════════
1516# Singleton
1517# ═══════════════════════════════════════════════════════════════
_router_instance: Optional[TTSRouter] = None


def get_tts_router() -> TTSRouter:
    """Return the process-wide TTSRouter singleton, creating it lazily."""
    global _router_instance
    if _router_instance is not None:
        return _router_instance
    _router_instance = TTSRouter()
    return _router_instance
1530# ═══════════════════════════════════════════════════════════════
1531# ModelCatalog integration — populate_tts_catalog()
1532# ═══════════════════════════════════════════════════════════════
# Reflection-dispatch contract for catalog entries that have NO
# `tool_module` (pure-JSON model registration via admin UI / hive
# federation / model_catalog.json edit). An entry without `tool_module`
# MUST declare every field below in its `capabilities` dict — otherwise
# the dispatcher has no way to know how to instantiate the class, marshal
# the request, or normalize the return. See task #58 for the full
# rationale; the schema is finalized at 5 fields, no more.
# Enforced by _validate_engine_caps() at catalog-ingest time.
_REFLECTION_FIELDS: Tuple[str, ...] = (
    'import_path',    # 'pkg.module:ClassName'
    'init_args',      # dict — kwargs for ClassName(**init_args); {} OK
    'synth_method',   # str — instance method name
    'params_map',     # dict — {payload_key → method_kwarg}
    'output_format',  # canonical id (see _OUTPUT_FORMATS below)
)

# Canonical return-shape identifiers the reflection dispatcher knows
# how to normalize into a wire-format wav (or path). Engines that
# return shapes outside this set MUST use the `tool_module` escape
# hatch instead — the dispatcher won't guess.
_OUTPUT_FORMATS: Tuple[str, ...] = (
    'wav_bytes',   # bytes object holding a WAV-formatted byte stream
    'numpy_24k',   # 1-D float32 numpy array @ 24 kHz mono
    'file_path',   # str path to a wav file the engine wrote
    'bytesio',     # io.BytesIO containing wav bytes
)
1561def _validate_engine_caps(caps: Dict[str, Any]) -> Optional[str]:
1562 """Validate a TTS catalog entry's capabilities dict.
1564 Returns None when the entry is dispatchable, OR a human-readable
1565 error string when it is not. Two valid shapes:
1567 1. Python-tool path (escape hatch):
1568 caps['tool_module'] = 'pkg.module' # required
1569 The entry will be dispatched via the existing
1570 `gpu_worker._dispatch_and_run` path: import the module, pick
1571 up `_load[_<variant>]` / `_synthesize[_<variant>]` callbacks
1572 by convention. This is what every code-shipped engine in
1573 ENGINE_REGISTRY uses today.
1575 2. Pure-config / reflection path:
1576 caps lacks tool_module BUT declares ALL of _REFLECTION_FIELDS.
1577 The dispatcher will use reflection to instantiate the class
1578 and call the synth method — no .py file needed for adding
1579 new models that fit a homogeneous load+method API (Kokoro,
1580 Pocket-TTS, etc., evaluated empirically per engine).
1582 Validation fires at INGEST time (populate_tts_catalog upsert path
1583 AND _catalog_entry_to_spec read path) so a malformed entry cannot
1584 reach the dispatcher. This guards against the "user discovers the
1585 error only when they request the voice" failure mode.
1586 """
1587 if not isinstance(caps, dict):
1588 return f'capabilities must be a dict, got {type(caps).__name__}'
1590 if caps.get('tool_module'):
1591 # Python-tool entry — tool_module on its own is sufficient. The
1592 # dispatcher will pick up _load / _synthesize via convention.
1593 return None
1595 # Reflection entry — every field is required. No partial schemas.
1596 missing = [f for f in _REFLECTION_FIELDS if f not in caps]
1597 if missing:
1598 return (
1599 f'entry has no tool_module and is missing reflection fields '
1600 f'{missing}; reflection dispatch needs the full 5-field '
1601 f'contract: {list(_REFLECTION_FIELDS)}'
1602 )
1604 # Cheap shape sanity — early-fail with a precise message rather than
1605 # let the dispatcher trip on a bad type at synth time.
1606 if not isinstance(caps.get('init_args'), dict):
1607 return f'init_args must be a dict, got {type(caps.get("init_args")).__name__}'
1608 if not isinstance(caps.get('params_map'), dict):
1609 return f'params_map must be a dict, got {type(caps.get("params_map")).__name__}'
1610 if not isinstance(caps.get('synth_method'), str) or not caps['synth_method']:
1611 return 'synth_method must be a non-empty str'
1612 if not isinstance(caps.get('import_path'), str) or ':' not in caps['import_path']:
1613 return (
1614 f'import_path must be "pkg.module:ClassName", got '
1615 f'{caps.get("import_path")!r}'
1616 )
1617 if caps.get('output_format') not in _OUTPUT_FORMATS:
1618 return (
1619 f'output_format must be one of {list(_OUTPUT_FORMATS)}, got '
1620 f'{caps.get("output_format")!r}'
1621 )
1622 return None
# Human-readable display names for each engine (used in admin UI).
# Keys are the same engine ids used as ENGINE_REGISTRY lookups throughout
# this module.
_ENGINE_DISPLAY_NAMES: Dict[str, str] = {
    'chatterbox_turbo': 'Chatterbox Turbo (GPU, English, voice-clone)',
    'luxtts': 'LuxTTS (CPU, English, voice-clone)',
    'cosyvoice3': 'CosyVoice 3 (GPU, multilingual, voice-clone)',
    'f5_tts': 'F5-TTS (GPU, EN/ZH, voice-clone)',
    'indic_parler': 'Indic Parler-TTS (GPU, 22 Indic languages)',
    'chatterbox_ml': 'Chatterbox Multilingual (GPU, 23 languages, voice-clone)',
    'pocket_tts': 'Pocket TTS (CPU, English, voice-clone)',
    'kokoro': 'Kokoro 82M (CPU/GPU, English, neural)',
    'espeak': 'eSpeak-NG (CPU, 100+ languages, instant fallback)',
    'makeittalk': 'MakeItTalk (Cloud, English)',
    'melotts': 'MeloTTS (CPU/GPU, 6 langs, neural)',
    'xtts_v2': 'XTTS-v2 (GPU, 17 langs, voice-clone)',
    'mms_tts': 'MMS-TTS (CPU/GPU, 50+ langs via VITS)',
}
# Extra capabilities per engine that don't map 1-to-1 onto TTSEngineSpec fields
def _extra_caps(streaming: bool = False,
                paralinguistic: Optional[List[str]] = None,
                emotion_tags: bool = False) -> Dict[str, Any]:
    """Build one extra-capabilities record; the defaults describe a plain
    engine (no streaming, no paralinguistic markers, no emotion tags).

    A fresh list is returned for every call so records never alias each
    other's 'paralinguistic' value.
    """
    return {
        'streaming': streaming,
        'paralinguistic': list(paralinguistic) if paralinguistic else [],
        'emotion_tags': emotion_tags,
    }


_ENGINE_EXTRA_CAPS: Dict[str, Dict[str, Any]] = {
    'chatterbox_turbo': _extra_caps(
        paralinguistic=['emotion_happy', 'emotion_sad', 'emotion_angry',
                        'emotion_surprised', 'laughing', 'whispering'],
        emotion_tags=True,
    ),
    'luxtts': _extra_caps(),
    'cosyvoice3': _extra_caps(
        streaming=True,
        paralinguistic=['emotion_happy', 'emotion_sad', 'whispering'],
        emotion_tags=True,
    ),
    'f5_tts': _extra_caps(),
    'indic_parler': _extra_caps(),
    'chatterbox_ml': _extra_caps(
        paralinguistic=['emotion_happy', 'emotion_sad', 'whispering'],
        emotion_tags=True,
    ),
    'pocket_tts': _extra_caps(),
    'kokoro': _extra_caps(),
    'espeak': _extra_caps(),
    'makeittalk': _extra_caps(),
    'melotts': _extra_caps(),
    'xtts_v2': _extra_caps(),
    'mms_tts': _extra_caps(),
}
# Device → backend string mapping for ModelEntry.backend field.
# Built from (device, backend) pairs keyed on the enum's .value so the
# table reads as a single aligned list of associations.
_DEVICE_TO_BACKEND: Dict[str, str] = {
    device.value: backend
    for device, backend in (
        (TTSDevice.GPU_ONLY, 'torch'),
        (TTSDevice.GPU_PREFERRED, 'torch'),
        (TTSDevice.CPU_ONLY, 'in_process'),
        (TTSDevice.CLOUD, 'api'),
    )
}
# Device → (supports_gpu, supports_cpu) flags, keyed on the enum's .value.
_DEVICE_TO_COMPUTE: Dict[str, Tuple[bool, bool]] = {
    device.value: (gpu_ok, cpu_ok)
    for device, gpu_ok, cpu_ok in (
        (TTSDevice.GPU_ONLY, True, False),
        (TTSDevice.GPU_PREFERRED, True, True),
        (TTSDevice.CPU_ONLY, False, True),
        (TTSDevice.CLOUD, False, False),
    )
}
# DEPRECATED: VRAM specs now live in vram_manager.VRAM_BUDGETS (single
# source of truth). Use _engine_vram_gb(engine_id) helper below.
# This dict is kept for backward compatibility but should NOT be edited.
_ENGINE_VRAM_GB: Dict[str, float] = {}  # populated lazily by _engine_vram_gb


def _engine_vram_gb(engine_id: str) -> float:
    """Single source of truth for engine VRAM requirement.

    Reads from vram_manager.VRAM_BUDGETS — the canonical specs.
    The vram_manager key convention is 'tts_<engine_id>' (e.g. 'tts_indic_parler').
    Returns 0.0 only if engine has no GPU requirement (CPU-only engine).
    Logs a warning if engine is GPU-capable but missing from VRAM_BUDGETS
    (catches drift between the two registries).

    Args:
        engine_id: registry key, e.g. 'kokoro' (without the 'tts_' prefix).

    Returns:
        The engine's VRAM requirement in GB as a float; 0.0 when the
        engine has no VRAM_BUDGETS entry or vram_manager is unavailable.
    """
    # Fast path: answer already cached. Fallback 0.0 is cached too, so the
    # drift warning below fires at most once per engine per process.
    if engine_id in _ENGINE_VRAM_GB:
        return _ENGINE_VRAM_GB[engine_id]
    try:
        from integrations.service_tools.vram_manager import VRAM_BUDGETS
        key = f'tts_{engine_id}'
        if key in VRAM_BUDGETS:
            # float() guarantees the annotated return type even if the
            # budget table stores an int.
            vram = float(VRAM_BUDGETS[key][0])  # (gpu_gb, cpu_gb)
            _ENGINE_VRAM_GB[engine_id] = vram
            return vram
        # Engine not registered in vram_manager — WARNING (not debug) per
        # the docstring contract: this is registry drift worth surfacing.
        logger.warning(
            "TTS engine %r has no VRAM_BUDGETS entry (key=%r) — "
            "assuming CPU-only. Add to vram_manager.VRAM_BUDGETS if GPU-capable.",
            engine_id, key,
        )
    except ImportError:
        # Expected in stripped-down deployments; stay quiet at debug level.
        logger.debug("vram_manager unavailable, assuming CPU-only for %r", engine_id)
    _ENGINE_VRAM_GB[engine_id] = 0.0
    return 0.0
# Approximate disk footprint per engine (GB)
# NOTE(review): 'kokoro' has no entry here — populate_tts_catalog's
# .get(engine_id, 0.0) fallback reports 0.0 GB disk for it; confirm
# whether that omission is intentional.
_ENGINE_DISK_GB: Dict[str, float] = dict(
    chatterbox_turbo=2.0,
    luxtts=0.5,
    cosyvoice3=3.5,
    f5_tts=2.5,
    indic_parler=4.0,
    chatterbox_ml=3.0,
    pocket_tts=0.1,
    espeak=0.05,
    makeittalk=0.0,
    melotts=1.5,   # 6 per-lang checkpoints, ~250 MB each
    xtts_v2=2.0,   # weights + speakers + config
    mms_tts=0.2,   # ~150 MB per lang lazy-downloaded
)
# Approximate RAM needed for CPU-capable engines (GB)
# NOTE(review): 'kokoro' is missing here as well — populate_tts_catalog's
# .get(engine_id, 0.5) fallback applies; verify that default is adequate.
_ENGINE_RAM_GB: Dict[str, float] = dict(
    chatterbox_turbo=2.0,
    luxtts=2.0,
    cosyvoice3=4.0,
    f5_tts=2.0,
    indic_parler=4.0,
    chatterbox_ml=4.0,
    pocket_tts=0.5,
    espeak=0.1,
    makeittalk=0.1,
    melotts=2.0,
    xtts_v2=3.0,
    mms_tts=1.5,
)
def populate_tts_catalog(catalog) -> int:
    """Convert ENGINE_REGISTRY into ModelEntry objects and register them.

    Called by ModelCatalog.populate_from_subsystems() via the populator
    plugin mechanism — keeps tts_router as the single source of truth for
    TTS engine capabilities.

    Validation contract (#58): admin- or hive-supplied catalog entries
    that exist BEFORE this populator runs are validated against
    `_validate_engine_caps`. Invalid entries are removed from the
    catalog with a logged WARNING — they cannot reach the dispatcher.
    This is the "fail-fast at catalog ingest, not synth time" half of
    the contract; the other half (validation on every read) lives in
    `_catalog_entry_to_spec`.

    Args:
        catalog: ModelCatalog instance (accepts Any to avoid a hard import
            at module level — the catalog is passed in by the caller).

    Returns:
        Number of new entries added (skips already-registered IDs).
    """
    # Lazy import inside function body — avoids circular import at module load
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Pre-pass: validate any existing TTS entries (admin/hive seeded the
    # catalog before us). Invalid entries are removed + logged so they
    # don't poison `_refresh_engine_registry_from_catalog` below. Code-
    # shipped engines (ENGINE_REGISTRY) ALWAYS have tool_module so they
    # never trip this; the gate exists for foreign manifests.
    _drop_ids: List[str] = []
    for entry in list(catalog.list_by_type('tts')):
        err = _validate_engine_caps(entry.capabilities or {})
        if err:
            logger.warning(
                'TTS catalog entry %r rejected at ingest: %s', entry.id, err,
            )
            _drop_ids.append(entry.id)
    # #58 Scope-2 (2026-05-07): reflection-only entries (caps lack
    # tool_module but declare the full 5-field contract) are now
    # dispatchable via `gpu_worker._dispatch_catalog_id` (`python -m
    # gpu_worker --catalog-id <id>`). They survive ingest as long as
    # `_validate_engine_caps` passes; they are EXCLUDED from the
    # ENGINE_REGISTRY snapshot by `_refresh_engine_registry_from_catalog`
    # because TTSEngineSpec carries `tool_module` as a non-optional
    # dispatch handle for the existing call sites. The catalog reads
    # them via the --catalog-id path instead.
    for _eid in _drop_ids:
        try:
            catalog.unregister(_eid, persist=False)
        except Exception as _re:
            # Best-effort cleanup — a failed unregister must not abort
            # the whole populate pass.
            logger.debug('failed to unregister invalid TTS entry %r: %s',
                         _eid, _re)

    added = 0
    for engine_id, spec in ENGINE_REGISTRY.items():
        # Catalog ID convention: 'tts-' prefix, underscores become dashes.
        # Built once here — used for both the skip check and the entry.
        entry_id = f'tts-{engine_id.replace("_", "-")}'
        # Skip if already registered (preserves user edits from admin UI)
        if catalog.get(entry_id) is not None:
            continue

        device_value = spec.device.value
        supports_gpu, supports_cpu = _DEVICE_TO_COMPUTE.get(
            device_value, (False, True)
        )
        backend = _DEVICE_TO_BACKEND.get(device_value, 'in_process')

        # Build language_priority from LANG_ENGINE_PREFERENCE:
        # lower rank in the preference list → lower priority number → preferred
        lang_priority: Dict[str, int] = {}
        for lang, engine_list in LANG_ENGINE_PREFERENCE.items():
            if engine_id in engine_list:
                rank = engine_list.index(engine_id)  # 0 = most preferred
                lang_priority[lang] = rank * 10  # 0, 10, 20, ...

        # Pick the best latency figure for quality/speed scores
        best_latency_ms = min(
            (v for v in (spec.latency_gpu_ms, spec.latency_cpu_ms,
                         spec.latency_cloud_ms) if v > 0),
            default=5000,
        )
        # speed_score: 1.0 = instant (≤10 ms), 0.0 = very slow (≥5000 ms).
        # Clamped on BOTH ends — without the min(), an engine faster than
        # 10 ms would score above 1.0 and skew catalog ranking.
        speed_score = min(1.0, max(0.0, 1.0 - (best_latency_ms - 10) / 4990))

        # Build capabilities dict — TTS-specific fields + extras
        extra = _ENGINE_EXTRA_CAPS.get(engine_id, {})
        capabilities: Dict[str, Any] = {
            'voice_clone': spec.voice_clone,
            'sample_rate': spec.sample_rate,
            'latency_gpu_ms': spec.latency_gpu_ms,
            'latency_cpu_ms': spec.latency_cpu_ms,
            'latency_cloud_ms': spec.latency_cloud_ms,
            'tool_module': spec.tool_module,
            'tool_function': spec.tool_function,
            'vram_key': spec.vram_key,
            'streaming': extra.get('streaming', False),
            'paralinguistic': extra.get('paralinguistic', []),
            'emotion_tags': extra.get('emotion_tags', False),
        }

        # languages list — ('*',) means "all"; store as-is so select_best
        # language matching still works (catalog treats '*' as wildcard)
        languages = list(spec.languages)

        entry = ModelEntry(
            id=entry_id,
            name=_ENGINE_DISPLAY_NAMES.get(engine_id, engine_id),
            model_type=ModelType.TTS,
            version='1.0',
            source='cloud' if spec.device == TTSDevice.CLOUD else 'local',
            vram_gb=_engine_vram_gb(engine_id),
            ram_gb=_ENGINE_RAM_GB.get(engine_id, 0.5),
            disk_gb=_ENGINE_DISK_GB.get(engine_id, 0.0),
            min_capability_tier='lite' if supports_cpu else 'standard',
            backend=backend,
            supports_gpu=supports_gpu,
            supports_cpu=supports_cpu,
            supports_cpu_offload=False,
            idle_timeout_s=300.0,
            capabilities=capabilities,
            quality_score=spec.quality,
            speed_score=round(speed_score, 3),
            priority=50,
            languages=languages,
            language_priority=lang_priority,
            tags=['tts', 'local' if spec.device != TTSDevice.CLOUD else 'cloud'],
            enabled=True,
            auto_load=False,
        )
        catalog.register(entry, persist=False)
        added += 1

    # Post-upsert: rebuild ENGINE_REGISTRY in place so it reflects the
    # current catalog state (admin/hive-edited entries become visible
    # to existing call sites). Snapshot semantics — runtime catalog
    # mutations after this point do NOT auto-propagate; a re-bootstrap
    # is required. Matches the dict-iter assumption every existing
    # ENGINE_REGISTRY caller relies on. See task #58 acceptance #5.
    _refresh_engine_registry_from_catalog(catalog)

    return added
def _refresh_engine_registry_from_catalog(catalog) -> int:
    """Rebuild ENGINE_REGISTRY in place from the post-upsert catalog.

    Reflection-only entries (no tool_module) are excluded — they live
    only in the catalog and are dispatched via the `--catalog-id`
    path. TTSEngineSpec callers continue to see only spec-shaped
    entries, exactly as before this refactor.

    Returns the number of entries in the rebuilt registry.

    Idempotent: calling twice with the same catalog state produces the
    same registry contents.
    """
    rebuilt: Dict[str, TTSEngineSpec] = {}
    for tts_entry in catalog.list_by_type('tts'):
        converted = _catalog_entry_to_spec(tts_entry)
        if converted is None:
            # None ⇒ caps failed validation, or reflection-only entry.
            continue
        rebuilt[converted.engine_id] = converted
    # clear() + update() mutates the existing dict object, so modules
    # that imported ENGINE_REGISTRY keep seeing the refreshed contents.
    ENGINE_REGISTRY.clear()
    ENGINE_REGISTRY.update(rebuilt)
    return len(rebuilt)
def _catalog_entry_to_spec(entry) -> Optional[TTSEngineSpec]:
    """Convert a ModelCatalog ModelEntry back to a TTSEngineSpec.

    Used by code that needs a TTSEngineSpec but only has a catalog entry
    (e.g. when the router consults the catalog for dynamically registered
    engines that were not present in ENGINE_REGISTRY at startup).

    Returns None if:
      * the entry's capabilities fail validation (#58 contract — see
        `_validate_engine_caps`); the caller should NOT see that entry
        because the dispatcher cannot route to it.
      * the entry uses the reflection-only dispatch path (no tool_module).
        TTSEngineSpec carries `tool_module` as a non-optional dispatch
        handle for the existing call sites; reflection-only entries are
        dispatched directly from the catalog and are intentionally
        excluded from the ENGINE_REGISTRY snapshot.
    """
    caps = entry.capabilities or {}
    if _validate_engine_caps(caps):
        # Invalid caps — the populator/loader already logged this loudly
        # at ingest; stay silent on re-reads to avoid log spam.
        return None

    module = caps.get('tool_module')
    if not module:
        # Valid reflection-only entry: TTSEngineSpec requires tool_module,
        # so `_refresh_engine_registry_from_catalog` skips it and the
        # catalog `--catalog-id` dispatch path serves it instead.
        return None

    # Reconstruct the TTSDevice from the supports_* flags, falling back
    # to the cloud-latency hint when the entry claims neither GPU nor CPU.
    has_gpu, has_cpu = entry.supports_gpu, entry.supports_cpu
    if has_gpu:
        device = TTSDevice.GPU_PREFERRED if has_cpu else TTSDevice.GPU_ONLY
    elif has_cpu:
        device = TTSDevice.CPU_ONLY
    elif caps.get('latency_cloud_ms', 0) > 0:
        device = TTSDevice.CLOUD
    else:
        device = TTSDevice.CPU_ONLY

    # populate_tts_catalog prefixes catalog IDs with 'tts-'; undo that.
    engine_id = entry.id[len('tts-'):] if entry.id.startswith('tts-') else entry.id

    return TTSEngineSpec(
        engine_id=engine_id,
        device=device,
        vram_key=caps.get('vram_key', ''),
        languages=tuple(entry.languages) if entry.languages else ('en',),
        quality=entry.quality_score,
        voice_clone=caps.get('voice_clone', False),
        latency_gpu_ms=caps.get('latency_gpu_ms', 0),
        latency_cpu_ms=caps.get('latency_cpu_ms', 0),
        latency_cloud_ms=caps.get('latency_cloud_ms', 0),
        tool_module=module,
        tool_function=caps.get('tool_function'),
        sample_rate=caps.get('sample_rate', 24000),
    )