Coverage for integrations / channels / media / tts_router.py: 79.1%

521 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1"""Smart TTS Router — selects the best TTS engine based on constraints. 

2 

3Decision factors (in priority order): 

41. Language — which engines support the target language? 

52. Availability — is the engine installed locally? 

63. Hardware — GPU present? Enough VRAM? CPU-only fallback? 

74. Compute policy — local_only | local_preferred | any (hive offload) 

85. Latency — instant (espeak/browser) vs quality (neural) 

96. Voice cloning — only clone-capable engines if voice requested 

107. Hive peers — offload to GPU peer when local can't serve 

11""" 

12 

13import json 

14import logging 

15import time 

16from dataclasses import dataclass, field 

17from enum import Enum 

18from typing import Any, Dict, List, Optional, Tuple 

19 

20logger = logging.getLogger(__name__) 

21 

# ═══════════════════════════════════════════════════════════════
# Source → Urgency mapping (backend auto-infers, frontends send source)
# ═══════════════════════════════════════════════════════════════

# Frontends only report WHERE a request came from; this table translates
# that origin into an urgency tier the router understands
# ('instant' | 'normal' | 'quality').
SOURCE_URGENCY: Dict[str, str] = {
    'chat_response': 'normal',    # agent reply inside a chat session
    'notification': 'instant',    # system notification — speed first
    'greeting': 'instant',        # boot/login greeting — speed first
    'read_aloud': 'quality',      # user clicked "speak this"
    'channel': 'normal',          # Discord/Telegram response
    'cli': 'quality',             # hart voice "text"
    'agent_tool': 'normal',       # agent invoking the TTS tool
}

35 

# ═══════════════════════════════════════════════════════════════
# Engine Registry — static capabilities of every TTS engine
# ═══════════════════════════════════════════════════════════════

class TTSDevice(Enum):
    """Execution substrate an engine requires (or prefers)."""
    GPU_ONLY = "gpu_only"
    GPU_PREFERRED = "gpu_preferred"  # runs on CPU as well; GPU is faster
    CPU_ONLY = "cpu_only"
    CLOUD = "cloud"

45 

46 

@dataclass(frozen=True)
class TTSEngineSpec:
    """Static specification of a TTS engine's capabilities.

    The router never imports an engine to discover what it can do —
    every routing input lives in this record.

    Fields:
        engine_id: stable identifier; also the ENGINE_REGISTRY key.
        device: execution profile (see TTSDevice).
        vram_key: key in VRAM_BUDGETS (vram_manager.py); '' when the
            engine has no VRAM budget.
        languages: ISO 639-1 codes served, or ('*',) for all.
        quality: subjective 0.0-1.0 quality score.
        voice_clone: True when the engine can clone a reference voice.
        latency_gpu_ms / latency_cpu_ms / latency_cloud_ms: estimated
            latency on each substrate; 0 means not applicable.
        tool_module: Python module path for the tool, or None for
            engines handled by a special-cased path.
        tool_function: parent-side synthesize function name, or None.
        tool_worker_attr: ToolWorker attribute name on the tool module;
            None for CPU-only engines with no subprocess worker.
        required_package: pip package name that must be importable at
            runtime; None when deps are bundled (e.g. piper) or the
            engine is CPU-only with no extra deps.
        pip_install_plan: canonical pip-spec list that makes
            `required_package` importable AND synth-functional. It
            includes transitive deps the upstream package forgets to
            declare in install_requires — e.g. chatterbox-tts ships
            `import librosa` in tts.py without listing librosa, so a
            no-deps install leaves a package that satisfies
            find_spec() but blows up on actual synthesize calls (see
            ~/Documents/Nunba/logs/probe_chatterbox_turbo.err).
            Single source of truth for the desktop installer (Nunba)
            so it doesn't carry a parallel dict that drifts. Empty
            tuple = nothing to install (bundled / CPU stub).
        install_target: WHERE pip_install_plan lands on the desktop
            installer:
            'main'      — main python-embed site-packages (legacy
                          default; risky, dep conflicts mask silent
                          failures)
            'venv'      — private venv at
                          ~/Documents/Nunba/data/venvs/<engine>/;
                          requires a per-engine worker file
                          (tts/<engine>_worker.py) the parent
                          dispatches into via
                          backend_venv.invoke_in_venv()
            'bundled'   — already on disk via the frozen build
                          (piper voices, luxtts, espeak)
            'cloud'     — HTTP-only, nothing to install (makeittalk)
            'git_clone' — needs git clone of an upstream repo plus
                          pip install -e (cosyvoice3 →
                          FunAudioLLM/CosyVoice)
            Flipping a GPU engine to 'venv' requires the matching
            worker file in Nunba, or dispatch falls back to an
            in-process import that only works when the engine is
            also installed in main.
        sample_rate: output sample rate in Hz.
    """
    engine_id: str
    device: TTSDevice
    vram_key: str
    languages: Tuple[str, ...]
    quality: float
    voice_clone: bool
    latency_gpu_ms: int
    latency_cpu_ms: int
    latency_cloud_ms: int
    tool_module: Optional[str]
    tool_function: Optional[str]
    tool_worker_attr: Optional[str] = None
    required_package: Optional[str] = None
    pip_install_plan: Tuple[str, ...] = ()
    install_target: str = 'main'
    sample_rate: int = 24000

141 

142 

143# Shared pip-spec constants — keep here so the install plans below stay 

144# readable and so a single edit updates every engine that pins them. 

145# 

146# huggingface_hub 0.29+ removes is_offline_mode that transformers <5.x 

147# still imports, so we cap below 0.29 for the chatterbox / kokoro chain. 

148_HF_HUB_PIN = 'huggingface_hub>=0.27.0,<0.29.0' 

149 

150# Chatterbox plan — `chatterbox-tts` on PyPI omits MULTIPLE runtime 

151# imports from its install_requires. Each one only surfaces when the 

152# install proceeds far enough for the next one to be reached: 

153# 

154# chatterbox/__init__.py:9 → from .tts import ChatterboxTTS 

155# chatterbox/tts.py:4 → import librosa (missing #1) 

156# chatterbox/tts.py:6 → import perth (missing #2) 

157# 

158# Each was discovered from a real failed install at 

159# ~/Documents/Nunba/logs/probe_chatterbox_turbo.err on the user's 

160# desktop — first librosa, then once that was added, perth. Listing 

161# them all here means a fresh chatterbox install completes in one 

162# pip pass instead of needing 2-3 self-heal iterations (each of 

163# which downloads ~10 MB of pip metadata). The Nunba self-heal 

164# loop catches future un-declared transitives on the install screen 

165# without surfacing a synth failure to the user. 

166_CHATTERBOX_PIP_PLAN: Tuple[str, ...] = ( 

167 _HF_HUB_PIN, 

168 'torchaudio', 

169 'chatterbox-tts', 

170 'librosa', # missing transitive #1 — chatterbox/tts.py:4 

171 'soundfile', # librosa needs it on Windows for non-WAV outputs 

172 'resemble-perth', # missing transitive #2 — chatterbox/tts.py:6 

173 # `import perth`; PyPI pkg name = resemble-perth 

174 # (the watermark library Resemble AI uses to 

175 # tag synthesized audio). 

176 # NOTE on the rest of chatterbox-tts==0.1.7's requires_dist 

177 # (omegaconf, conformer, pyloudnorm, pykakasi, spacy-pkuseg, 

178 # diffusers, einops, s3tokenizer, etc.): 

179 # We deliberately do NOT pre-install them in one pip pass. 

180 # When pip is asked to install many at once with 

181 # `--no-build-isolation` (frozen build constraint, see 

182 # package_installer._run_pip), and one of the transitives needs 

183 # a source build (omegaconf → antlr4-python3-runtime==4.9.* is 

184 # sdist-only on PyPI), pip's parallel-builds path races against 

185 # the bundle's setuptools and surfaces as 

186 # BackendUnavailable: Cannot import 'setuptools.build_meta' 

187 # (observed 2026-04-28 on the user's bundle f2d4567 — full pip 

188 # invocation aborts rc=2, no transitive gets installed). 

189 # _self_heal_missing_transitives in package_installer.py handles 

190 # them one-at-a-time AFTER the chatterbox-tts top-level install 

191 # — single-package mode never triggers the parallel-build race. 

192 # Combined with the PYTHONNOUSERSITE=1 fix in tts/_torch_probe.py 

193 # (probe no longer leaks system Python's site-packages), each 

194 # heal cycle finds a REAL missing transitive, not a phantom one. 

195 # The original 5-cycle trail (librosa → perth → einops → 

196 # s3tokenizer → omegaconf) is fine because each cycle resolves 

197 # in ~10-30s of single-package pip work, not 5 minutes of 

198 # parallel-build resolver thrash. 

199) 

200 

201 

# All known TTS engines
ENGINE_REGISTRY: Dict[str, TTSEngineSpec] = {
    'chatterbox_turbo': TTSEngineSpec(
        engine_id='chatterbox_turbo',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_chatterbox_turbo',
        languages=('en',),
        quality=0.95,
        voice_clone=True,
        latency_gpu_ms=150,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.chatterbox_tool',
        tool_function='chatterbox_synthesize',
        tool_worker_attr='_turbo',
        required_package='chatterbox',
        pip_install_plan=_CHATTERBOX_PIP_PLAN,
        # chatterbox-tts 0.1.7 hard-pins torch==2.6.0,
        # transformers==5.2.0, numpy<2.0.0, diffusers==0.29.0,
        # safetensors==0.5.3 — all conflicting with HARTOS's main
        # interpreter (torch 2.11, transformers 5.1, numpy 2.4,
        # diffusers 0.37, safetensors 0.7). Auto-heal can never satisfy
        # these: downgrading main would break llama-server,
        # indic_parler, faster-whisper and every other ML stack. So we
        # quarantine into a private venv (same pattern as indic_parler,
        # whose parler-tts pins transformers<4.47). Nunba's
        # tts/package_installer.py routes the install into
        # ~/.nunba/venvs/chatterbox_turbo/, and the HARTOS ToolWorker's
        # python_exe is rewired to the venv's python at boot via
        # desktop/_wire_venv_engines.py, so the synth subprocess sees
        # the pinned chatterbox-compatible deps.
        install_target='venv',
    ),
    # luxtts REMOVED — poor audio quality, not suitable for any use case.
    'cosyvoice3': TTSEngineSpec(
        engine_id='cosyvoice3',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_cosyvoice3',
        languages=('zh', 'ja', 'ko', 'de', 'es', 'fr', 'it', 'ru', 'en'),
        quality=0.92,
        voice_clone=True,
        latency_gpu_ms=200,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.cosyvoice_tool',
        tool_function='cosyvoice_synthesize',
        tool_worker_attr='_tool',
        required_package='cosyvoice',
        # cosyvoice is NOT pip-installable — it needs a `git clone` of
        # FunAudioLLM/CosyVoice plus model weights via huggingface_hub.
        # Empty plan + install_target='git_clone' tells Nunba to skip
        # the pip path entirely and use its git-clone handler. The
        # verify-synth probe must also short-circuit on git_clone
        # engines when the package isn't importable (current Nunba bug:
        # probe runs `import cosyvoice` blindly and always fails).
        pip_install_plan=(),
        install_target='git_clone',
    ),
    'f5_tts': TTSEngineSpec(
        engine_id='f5_tts',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_f5',
        languages=('en', 'zh'),
        quality=0.91,
        voice_clone=True,
        latency_gpu_ms=200,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.f5_tts_tool',
        tool_function='f5_synthesize',
        tool_worker_attr='_tool',
        required_package='f5_tts',
        pip_install_plan=('torchaudio', 'f5-tts'),
    ),
    'indic_parler': TTSEngineSpec(
        engine_id='indic_parler',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_indic_parler',
        languages=(
            'hi', 'ta', 'te', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ur',
            'as', 'bho', 'doi', 'kok', 'mai', 'mni', 'ne', 'sa', 'sat', 'sd', 'en',
        ),
        quality=0.90,
        voice_clone=False,
        latency_gpu_ms=300,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.indic_parler_tool',
        tool_function='indic_parler_synthesize',
        tool_worker_attr='_tool',
        required_package='parler_tts',
        # Quarantined into its own venv on the desktop: parler-tts
        # 0.2.2 hard-pins transformers<4.47, conflicting with main's
        # transformers 5.1.0. The full pip plan lives here so it
        # travels with the engine spec; the desktop installer routes it
        # into the venv when install_target='venv'. Worker file:
        # tts/indic_parler_worker.py (Nunba). The HARTOS server runs
        # Indic Parler in its own subprocess worker, so the main
        # interpreter pin doesn't apply there either.
        pip_install_plan=(
            # tqdm + colorama pinned FIRST to stop pip's resolver
            # backtracking through colorama 0.1.x (no setup.py, breaks
            # install). Witnessed user-facing failure:
            # "Indic Parler TTS unavailable — using fallback voice engine"
            # Root-caused from ~/Documents/Nunba/logs/venv_indic_parler.log.
            'colorama>=0.4.6',
            'tqdm>=4.65',
            'transformers==4.46.1',  # parler-tts 0.2.2 requires <4.47
            'torch',                 # CPU-ish fallback; replaced by CUDA if GPU
            'torchaudio',
            'sentencepiece',
            'descript-audio-codec',
            'parler-tts==0.2.2',     # 0.2.3 has DacModel.decode() API mismatch
            'soundfile',
            _HF_HUB_PIN,
        ),
        install_target='venv',
    ),
    'chatterbox_ml': TTSEngineSpec(
        engine_id='chatterbox_ml',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_chatterbox_ml',
        languages=(
            'en', 'zh', 'ja', 'ko', 'de', 'es', 'fr', 'it', 'ru', 'pt',
            'ar', 'nl', 'pl', 'sv', 'tr', 'hi', 'ta', 'te', 'bn', 'id',
            'th', 'vi', 'cs',
        ),
        quality=0.94,
        voice_clone=True,
        latency_gpu_ms=300,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.chatterbox_tool',
        tool_function='chatterbox_ml_synthesize',
        tool_worker_attr='_ml',
        required_package='chatterbox',
        pip_install_plan=_CHATTERBOX_PIP_PLAN,
    ),
    'pocket_tts': TTSEngineSpec(
        engine_id='pocket_tts',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('en',),
        quality=0.85,
        voice_clone=True,
        latency_gpu_ms=0,
        latency_cpu_ms=200,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.pocket_tts_tool',
        tool_function='pocket_tts_synthesize',
        pip_install_plan=('pocket-tts',),
    ),
    # NeuTTS Air — Neuphonic 748M-param Qwen2 backbone with NeuCodec
    # decoder, Apache 2.0. GGUF Q4 (~600MB) / Q8 (~800MB). RTF<0.5 on
    # CPU (Intel i5 / RPi 5), 24kHz output, instant voice cloning from
    # 3-15s reference audio. English primary. Sits between omnivoice
    # and kokoro on the English quality ladder (kokoro=0.88,
    # neutts=0.91, omnivoice~0.93, chatterbox=0.95).
    #
    # Reference voice contract: NeuTTS needs a reference audio +
    # transcript per call (no built-in zero-config voices). The wrapper
    # resolves 'jo' (upstream sample shipped with the package), a path
    # to a .wav with companion .txt, or a custom name from
    # ~/.hevolve/models/tts/neutts/voices/. See
    # integrations/service_tools/neutts_tool.py for resolution.
    'neutts_air': TTSEngineSpec(
        engine_id='neutts_air',
        device=TTSDevice.GPU_PREFERRED,
        vram_key='tts_neutts',
        languages=('en',),
        quality=0.91,
        voice_clone=True,
        latency_gpu_ms=150,
        latency_cpu_ms=400,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.neutts_tool',
        tool_function='neutts_synthesize',
        # Required by Nunba's _SubprocessTTSBackend to drive the
        # subprocess worker. Without it the spec falls into the
        # _InProcessTTSBackend path, which does `import neutts` from
        # the MAIN interpreter — a guaranteed ImportError since
        # install_target='venv' puts the package in the per-engine
        # venv. '_tool' pairs with the ToolWorker singleton in
        # integrations.service_tools.neutts_tool, mirroring
        # kokoro / chatterbox / f5 / indic_parler.
        tool_worker_attr='_tool',
        required_package='neutts',
        # `neutts[all]` pulls llama-cpp-python (GGUF inference) plus
        # soundfile + onnxruntime — the base package alone can't synth
        # because the codec decoder needs onnxruntime. huggingface_hub
        # pinned via _HF_HUB_PIN to keep the transformers chain
        # consistent with the rest of the English ladder.
        pip_install_plan=(
            _HF_HUB_PIN,
            'neutts[all]',
            'soundfile',  # explicit — wrapper requires soundfile.write
        ),
        # Quarantined into its own venv on the desktop installer:
        # llama-cpp-python can drift from the main interpreter's
        # torch / numpy stack. Same pattern as chatterbox_turbo and
        # indic_parler.
        install_target='venv',
    ),
    # Kokoro 82M — tiny neural English TTS. Runs on CPU (≈1× real-time,
    # 200MB RAM) or GPU (≈0.1× real-time, 200MB VRAM). Quality sits
    # above Piper and below the big voice-clone engines — the right
    # second rung on the English ladder when GPU engines can't run
    # (no CUDA, VRAM full, package missing) but BEFORE falling all the
    # way to Piper.
    #
    # Benchmark context (vs piper, on English):
    #   quality: kokoro 0.88 vs piper 0.70 (subjective MOS gap)
    #   cpu latency: kokoro 400ms vs piper 200ms (per ~10 words)
    #   disk: kokoro 160MB vs piper 60MB (per voice)
    #   voices: kokoro ~25 vs piper ~15 (per-language catalog)
    # Piper still wins on raw CPU speed and disk — which is why it
    # stays the absolute last-resort fallback.
    'kokoro': TTSEngineSpec(
        engine_id='kokoro',
        device=TTSDevice.GPU_PREFERRED,
        vram_key='tts_kokoro',
        languages=('en',),
        quality=0.88,
        voice_clone=False,
        latency_gpu_ms=120,
        latency_cpu_ms=400,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.kokoro_tool',
        tool_function='kokoro_synthesize',
        tool_worker_attr='_tool',
        required_package='kokoro',
        pip_install_plan=(
            _HF_HUB_PIN,
            'kokoro',    # pulls misaki phonemizer transitively
            'espeakng',  # espeak-ng Python bindings (ships binary on Windows)
        ),
    ),
    # OmniVoice — universal TTS. Qwen3-0.6B backbone + diffusion head,
    # 646 languages (581k training hours), zero-shot voice cloning from
    # 3-10s of reference audio, Apache 2.0.
    #
    # languages=('*',) uses the same wildcard convention as espeak, but
    # select_engines() only considers engines explicitly listed in
    # LANG_ENGINE_PREFERENCE for the resolved language — 'omnivoice' is
    # prepended to every Indic + non-English entry and
    # _DEFAULT_PREFERENCE so it wins unless uninstalled or the GPU
    # can't hold it.
    #
    # VRAM is stubbed at 3.0 GB in vram_manager.VRAM_BUDGETS; the
    # worker self-reports actual usage on first load via
    # '__WORKER_VRAM_GB__' and vram_manager.record_actual_usage
    # tightens the budget.
    'omnivoice': TTSEngineSpec(
        engine_id='omnivoice',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_omnivoice',
        languages=('*',),  # 646 languages
        quality=0.93,
        voice_clone=True,
        latency_gpu_ms=250,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.omnivoice_tool',
        tool_function='omnivoice_synthesize',
        tool_worker_attr='_tool',
        required_package='omnivoice',
        # Per omnivoice_tool.py docstring: "Requires: pip install
        # omnivoice torch soundfile". torch is bundled.
        pip_install_plan=('omnivoice', 'soundfile'),
    ),
    'espeak': TTSEngineSpec(
        engine_id='espeak',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('*',),  # 100+ languages
        quality=0.40,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=10,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.pocket_tts_tool',
        tool_function='pocket_tts_synthesize',  # espeak is fallback inside pocket
        install_target='bundled',
    ),
    'makeittalk': TTSEngineSpec(
        engine_id='makeittalk',
        device=TTSDevice.CLOUD,
        vram_key='',
        languages=('en',),
        quality=0.88,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=0,
        latency_cloud_ms=5000,
        tool_module=None,  # special cloud path in model_bus_service
        tool_function=None,
        install_target='cloud',
    ),
    # Piper — bundled CPU engine, multilingual via downloadable voice
    # files. ('*',) wildcard (same convention as espeak) so one spec
    # covers every language Piper has voices for. A runtime synth
    # attempt raises on missing voice files and the router falls
    # through to a neural engine.
    'piper': TTSEngineSpec(
        engine_id='piper',
        device=TTSDevice.CPU_ONLY,
        vram_key='',
        languages=('*',),
        quality=0.70,
        voice_clone=False,
        latency_gpu_ms=0,
        latency_cpu_ms=200,
        latency_cloud_ms=0,
        tool_module=None,  # in-process via Nunba tts/piper_tts.py —
                           # no subprocess worker, no required_package
        tool_function=None,
        install_target='bundled',
    ),
    # ── Mid-VRAM coverage tier (1–3 GB) ───────────────────────────
    # These three fill the gap so every SUPPORTED_LANG_DICT code has at
    # least one engine with vram_gb≤3.0 in its preference ladder.
    # Indic Parler (2.0) + F5 (2.5) cover en/zh + 22 Indic; the trio
    # below adds the remaining major language families without forcing
    # users onto the 12-14 GB Chatterbox-ML or the uninstallable
    # git-clone CosyVoice path.
    'melotts': TTSEngineSpec(
        engine_id='melotts',
        device=TTSDevice.GPU_PREFERRED,  # works on CPU at real-time too
        vram_key='tts_melotts',
        languages=('en', 'es', 'fr', 'zh', 'ja', 'ko'),
        quality=0.86,
        voice_clone=False,
        latency_gpu_ms=180,
        latency_cpu_ms=600,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.melotts_tool',
        tool_function='melotts_synthesize',
        tool_worker_attr='_tool',
        required_package='melo',  # `from melo.api import TTS`
        pip_install_plan=(
            _HF_HUB_PIN,
            'melotts',    # PyPI package; ships `melo` import root
            'soundfile',  # used for duration probe
        ),
    ),
    'xtts_v2': TTSEngineSpec(
        engine_id='xtts_v2',
        device=TTSDevice.GPU_ONLY,
        vram_key='tts_xtts_v2',
        languages=(
            'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
            'cs', 'ar', 'zh', 'hu', 'ko', 'ja', 'hi',
        ),
        quality=0.92,
        voice_clone=True,
        latency_gpu_ms=350,
        latency_cpu_ms=0,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.xtts_tool',
        tool_function='xtts_synthesize',
        tool_worker_attr='_tool',
        required_package='TTS',  # `from TTS.api import TTS`
        pip_install_plan=(
            _HF_HUB_PIN,
            'coqui-tts',  # idiap-maintained 2026 fork on PyPI; ships
                          # `from TTS.api import TTS` so the import
                          # path is stable
            'soundfile',
        ),
    ),
    'mms_tts': TTSEngineSpec(
        engine_id='mms_tts',
        device=TTSDevice.GPU_PREFERRED,  # CPU works, GPU faster
        vram_key='tts_mms_tts',
        languages=(
            # Roman-script languages route without uroman. Non-Roman
            # scripts (ar/hi/zh/ko/ja/...) ALSO have mms-tts
            # checkpoints but need uroman pre-processing — the tool
            # gracefully fails when uroman isn't installed and the
            # router falls through to the next preference. The broader
            # set is listed because the tool decides per-call whether
            # it can serve; the router's job is only to attempt.
            'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
            'cs', 'hu', 'sv', 'fi', 'el', 'ro', 'bg', 'uk', 'cy', 'is',
            'zh', 'ja', 'ko', 'vi', 'th', 'id', 'ms', 'km', 'lo', 'my',
            'hi', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'or',
            'ne', 'as', 'sd', 'sa', 'ur', 'si',
            'ar', 'fa', 'he', 'sw',
        ),
        quality=0.78,
        voice_clone=False,
        latency_gpu_ms=200,
        latency_cpu_ms=500,
        latency_cloud_ms=0,
        tool_module='integrations.service_tools.mms_tts_tool',
        tool_function='mms_tts_synthesize',
        tool_worker_attr='_tool',
        required_package='transformers',  # already bundled — no install plan
        pip_install_plan=(
            _HF_HUB_PIN,
            'soundfile',  # for WAV write
            # uroman is OPTIONAL — only needed for non-Roman scripts.
            # The tool falls through cleanly when it's missing, so the
            # perl repo + extra pip dep aren't bundled into every
            # install; users wanting broad Indic/Arabic/CJK coverage
            # from MMS specifically can `pip install uroman`.
        ),
    ),
}

614 

615 

# ═══════════════════════════════════════════════════════════════
# Language → Engine Preference Table
# ═══════════════════════════════════════════════════════════════

# Ordered by quality for each language — first available wins.
LANG_ENGINE_PREFERENCE: Dict[str, List[str]] = {
    # English ladder. chatterbox_turbo wins on quality (0.95);
    # omnivoice sits second for cross-engine consistency when the user
    # also runs non-English traffic (avoids swapping engines on every
    # language switch); neutts_air (0.91, CPU RTF<0.5, pip
    # neutts[all]) slots between omnivoice and melotts; kokoro (0.88)
    # is the best non-GPU quality; piper and espeak are the bundled
    # last-resort floor. luxtts was dropped (poor naturalness) but
    # remains reachable via direct engine selection for explicit
    # voice-clone requests. A missing package returns a clean
    # {error: ...} JSON from the wrapper and the ladder walks on
    # (tts_engine._synthesize_with_fallback).
    'en': ['chatterbox_turbo', 'omnivoice', 'neutts_air', 'melotts', 'xtts_v2', 'kokoro', 'pocket_tts', 'cosyvoice3', 'mms_tts', 'piper', 'espeak'],
    # Indic — omnivoice replaces indic_parler as primary (parler kept
    # as fallback for one release cycle): 100-400 training hours per
    # major Indic language vs parler's ~10, plus voice cloning which
    # parler lacks. XTTS-v2 adds Hindi only; MMS-TTS adds the rest as
    # 1 GB-tier coverage.
    'hi': ['omnivoice', 'indic_parler', 'xtts_v2', 'chatterbox_ml', 'cosyvoice3', 'mms_tts', 'espeak'],
    'ta': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'te': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'bn': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'gu': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'kn': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ml': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'mr': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'or': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'pa': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ur': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'as': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ne': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'sa': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'si': ['omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak'],                   # Sinhala — mms-tts adds 1 GB-tier
    'sd': ['omnivoice', 'indic_parler', 'chatterbox_ml', 'mms_tts', 'espeak'],   # Sindhi — Indic Parler + mms
    # CJK — omnivoice has 500k+ hours of CJK training; promoted over
    # cosyvoice. MeloTTS slots above the heavy Chatterbox-ML for the
    # 1.5 GB tier.
    'zh': ['omnivoice', 'melotts', 'cosyvoice3', 'f5_tts', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ja': ['omnivoice', 'melotts', 'cosyvoice3', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ko': ['omnivoice', 'melotts', 'cosyvoice3', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    # European — XTTS-v2 (2.5 GB, voice clone) and MeloTTS (1.5 GB)
    # slot above the 12 GB Chatterbox-ML so 4-8 GB GPUs get quality
    # TTS without a 14 GB allocation pushing other workers off the GPU.
    'de': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'es': ['omnivoice', 'melotts', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'fr': ['omnivoice', 'melotts', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'it': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ru': ['omnivoice', 'xtts_v2', 'cosyvoice3', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'pt': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'ar': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'nl': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'pl': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'sv': ['omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'tr': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'id': ['omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'th': ['omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'vi': ['omnivoice', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'cs': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    # SUPPORTED_LANG_DICT entries that previously fell through to
    # _DEFAULT_PREFERENCE, where chatterbox_ml needs 14 GB — MMS-TTS
    # at 1 GB now provides the always-runnable fallback.
    'hu': ['omnivoice', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
    'el': ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak'],
    'fi': ['omnivoice', 'mms_tts', 'espeak'],
    'ro': ['omnivoice', 'mms_tts', 'espeak'],
    'bg': ['omnivoice', 'mms_tts', 'espeak'],
    'uk': ['omnivoice', 'mms_tts', 'espeak'],
    'cy': ['omnivoice', 'mms_tts', 'espeak'],  # Welsh
    'is': ['omnivoice', 'mms_tts', 'espeak'],  # Icelandic
    'ms': ['omnivoice', 'mms_tts', 'espeak'],  # Malay
    'fa': ['omnivoice', 'mms_tts', 'espeak'],  # Persian (uroman)
    'he': ['omnivoice', 'mms_tts', 'espeak'],  # Hebrew (uroman)
    'sw': ['omnivoice', 'mms_tts', 'espeak'],  # Swahili
    'km': ['omnivoice', 'mms_tts', 'espeak'],  # Khmer (uroman)
    'lo': ['omnivoice', 'mms_tts', 'espeak'],  # Lao (uroman)
    'my': ['omnivoice', 'mms_tts', 'espeak'],  # Burmese (uroman)
    # Additional Indic codes from SUPPORTED_LANG_DICT that weren't in
    # this table before — these ride Indic Parler's 22-language
    # coverage, then mms_tts.
    # NOTE(review): the indic_parler spec lists 'bho' (Bhojpuri) but
    # not 'brx' (Bodo) in its languages tuple — confirm which side is
    # correct before relying on indic_parler serving 'brx'.
    'brx': ['omnivoice', 'indic_parler', 'mms_tts', 'espeak'],  # Bodo
    'doi': ['omnivoice', 'indic_parler', 'mms_tts', 'espeak'],  # Dogri
    'kok': ['omnivoice', 'indic_parler', 'mms_tts', 'espeak'],  # Konkani
    'mai': ['omnivoice', 'indic_parler', 'mms_tts', 'espeak'],  # Maithili
    'mni': ['omnivoice', 'indic_parler', 'mms_tts', 'espeak'],  # Manipuri
    'sat': ['omnivoice', 'indic_parler', 'mms_tts', 'espeak'],  # Santali
    'ks': ['omnivoice', 'mms_tts', 'espeak'],                   # Kashmiri
    # Misc codes previously routed via _DEFAULT_PREFERENCE only.
    'lv': ['omnivoice', 'mms_tts', 'espeak'],                   # Latvian
    'sr': ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak'],  # Serbian
    'zh-cn': ['omnivoice', 'melotts', 'cosyvoice3', 'f5_tts', 'xtts_v2', 'chatterbox_ml', 'mms_tts', 'espeak'],
}

723 

# Fallback for unlisted languages — omnivoice covers 646 + mms_tts covers
# 1100+, so this is reached only when both are uninstalled / can't fit.
# chatterbox_ml is the heaviest local clone, espeak is the absolute floor.
# Order matters: select_engines() walks this ladder best-first.
_DEFAULT_PREFERENCE: List[str] = ['omnivoice', 'mms_tts', 'chatterbox_ml', 'espeak']

728 

729 

730# ═══════════════════════════════════════════════════════════════ 

731# Route result 

732# ═══════════════════════════════════════════════════════════════ 

733 

class TTSLocation(Enum):
    """Where a synthesis request will actually execute."""
    LOCAL = "local"          # this machine (GPU or CPU)
    HIVE_PEER = "hive_peer"  # offloaded to a GPU peer on the compute mesh
    CLOUD = "cloud"          # external HTTP API (e.g. MakeItTalk)

738 

739 

@dataclass
class TTSCandidate:
    """A scored TTS engine candidate.

    Produced by TTSRouter.select_engines(); candidates are sorted by an
    urgency-weighted mix of estimated_latency_ms and quality_score.
    """
    engine: TTSEngineSpec
    location: TTSLocation
    device: str  # 'gpu', 'cpu', 'cloud'
    estimated_latency_ms: int  # engine latency (+ network RTT for peers)
    quality_score: float  # spec quality, possibly penalized (CPU/network)
    peer_address: Optional[str] = None  # if location == HIVE_PEER
    warnings: List[str] = field(default_factory=list)  # surfaced to caller on success

750 

751 

@dataclass
class TTSResult:
    """Outcome of one TTS synthesis request (success or failure)."""
    path: str
    duration: float
    engine_id: str
    device: str
    location: str
    latency_ms: float
    sample_rate: int
    voice: str
    quality_score: float
    warnings: List[str] = field(default_factory=list)
    error: Optional[str] = None

    # Output key -> attribute name, in wire order. 'engine' is the only
    # key whose name differs from its backing attribute.
    _WIRE_FIELDS = (
        ('path', 'path'),
        ('duration', 'duration'),
        ('engine', 'engine_id'),
        ('device', 'device'),
        ('location', 'location'),
        ('latency_ms', 'latency_ms'),
        ('sample_rate', 'sample_rate'),
        ('voice', 'voice'),
        ('quality_score', 'quality_score'),
    )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict; warnings/error only when set."""
        payload: Dict[str, Any] = {
            key: getattr(self, attr) for key, attr in self._WIRE_FIELDS
        }
        if self.warnings:
            payload['warnings'] = self.warnings
        if self.error:
            payload['error'] = self.error
        return payload

784 

785 

786# ═══════════════════════════════════════════════════════════════ 

787# Language Detection 

788# ═══════════════════════════════════════════════════════════════ 

789 

def detect_language(text: str) -> str:
    """Detect language of text. Returns ISO 639-1 code (e.g. 'en', 'hi').

    Uses langdetect if available, falls back to Unicode-script heuristics.
    Empty/blank input and undetectable scripts default to 'en'.
    """
    if not text or not text.strip():
        return 'en'
    try:
        from langdetect import detect
        return detect(text)
    except Exception:
        # ImportError (langdetect not installed) or its LangDetectException
        # on odd input — either way fall through to the heuristic.
        pass

    # Heuristic fallback: count characters per Unicode script block over a
    # bounded sample and return the first script clearing the threshold.
    # Kana is deliberately checked BEFORE CJK ideographs: Japanese prose
    # mixes kanji with kana, and checking CJK first misread such text as
    # 'zh'. Chinese text contains no kana, so 'zh' detection is unaffected.
    script_ranges = (
        ('hi', '\u0900', '\u097F'),  # Devanagari
        ('ta', '\u0B80', '\u0BFF'),  # Tamil
        ('te', '\u0C00', '\u0C7F'),  # Telugu
        ('bn', '\u0980', '\u09FF'),  # Bengali
        ('gu', '\u0A80', '\u0AFF'),  # Gujarati
        ('kn', '\u0C80', '\u0CFF'),  # Kannada
        ('ml', '\u0D00', '\u0D7F'),  # Malayalam
        ('ja', '\u3040', '\u30FF'),  # Hiragana + Katakana (contiguous blocks)
        ('zh', '\u4E00', '\u9FFF'),  # CJK unified ideographs
        ('ko', '\uAC00', '\uD7AF'),  # Hangul syllables
        ('ar', '\u0600', '\u06FF'),  # Arabic
        ('ru', '\u0400', '\u04FF'),  # Cyrillic
    )
    sample = text[:500]
    # At least 3 chars, or 10% of the sample, must be in a script's range.
    threshold = max(3, len(sample) * 0.1)
    for lang, lo, hi in script_ranges:
        if sum(1 for c in sample if lo <= c <= hi) > threshold:
            return lang
    return 'en'

847 

848 

849# ═══════════════════════════════════════════════════════════════ 

850# Engine Availability Detection 

851# ═══════════════════════════════════════════════════════════════ 

852 

# Cache for engine availability (avoid repeated import checks).
# Maps engine_id -> (available, unix timestamp of the check).
_engine_available_cache: Dict[str, Tuple[bool, float]] = {}
_CACHE_TTL = 60.0  # seconds — availability is re-probed after this long

856 

857 

# Engines whose availability is probed by importing their service-tool
# entry point: engine_id -> (module path, symbol that must exist).
_ENGINE_IMPORT_PROBES: Dict[str, Tuple[str, str]] = {
    'pocket_tts': ('integrations.service_tools.pocket_tts_tool', 'pocket_tts_synthesize'),
    'luxtts': ('integrations.service_tools.luxtts_tool', 'luxtts_synthesize'),
    'cosyvoice3': ('integrations.service_tools.cosyvoice_tool', 'cosyvoice_synthesize'),
    'indic_parler': ('integrations.service_tools.indic_parler_tool', 'indic_parler_synthesize'),
    'chatterbox_turbo': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
    'chatterbox_ml': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
    'f5_tts': ('integrations.service_tools.f5_tts_tool', 'f5_synthesize'),
    'kokoro': ('integrations.service_tools.kokoro_tool', 'kokoro_synthesize'),
    # transformers is bundled; probing the VitsModel symbol detects
    # outright-broken transformers installs early.
    'mms_tts': ('transformers', 'VitsModel'),
}


def _is_engine_installed(engine_id: str) -> bool:
    """Check if a TTS engine's Python package is available.

    Results are cached for _CACHE_TTL seconds in _engine_available_cache.
    Unknown engines and registry entries without a tool_module are False.

    TODO REFACTOR: move to model_catalog as ModelEntry.is_installed() —
    a model that isn't pip-importable shouldn't be selectable by any caller.
    """
    now = time.time()
    cached = _engine_available_cache.get(engine_id)
    if cached and (now - cached[1]) < _CACHE_TTL:
        return cached[0]

    spec = ENGINE_REGISTRY.get(engine_id)
    if not spec or not spec.tool_module:
        _engine_available_cache[engine_id] = (False, now)
        return False

    available = False
    try:
        if engine_id == 'espeak':
            # espeak is a system binary, not a Python package.
            import shutil
            available = bool(shutil.which('espeak-ng') or shutil.which('espeak'))
        elif engine_id in _ENGINE_IMPORT_PROBES:
            import importlib
            module, symbol = _ENGINE_IMPORT_PROBES[engine_id]
            available = hasattr(importlib.import_module(module), symbol)
        elif engine_id == 'melotts':
            # `melotts` PyPI package ships the `melo` import root.
            import importlib.util as _ils
            available = _ils.find_spec('melo') is not None
        elif engine_id == 'xtts_v2':
            # `coqui-tts` PyPI package ships `from TTS.api import TTS`.
            import importlib.util as _ils
            available = _ils.find_spec('TTS') is not None
        elif engine_id == 'makeittalk':
            # Cloud engine — "installed" means the API endpoint is configured.
            import os
            available = bool(os.environ.get('MAKEITTALK_API_URL'))
    except Exception:
        # ImportError is an Exception subclass — one clause covers both the
        # missing-package and broken-install cases. Treat any probe failure
        # as "not installed".
        available = False

    _engine_available_cache[engine_id] = (available, now)
    return available

922 

923 

924def _get_gpu_info() -> Dict[str, Any]: 

925 """Get GPU info from VRAMManager (cached singleton).""" 

926 try: 

927 from integrations.service_tools.vram_manager import get_vram_manager 

928 mgr = get_vram_manager() 

929 return mgr.detect_gpu() 

930 except (ImportError, Exception): 

931 return {'cuda_available': False, 'total_gb': 0, 'free_gb': 0} 

932 

933 

def _can_fit_on_gpu(engine_id: str) -> bool:  # TODO REFACTOR: remove — duplicates catalog.matches_compute()
    """Return True if this engine's model fits in currently-free VRAM.

    False for unknown engines, engines without a vram_key, or whenever the
    VRAM manager is unavailable (conservative: assume it does not fit).
    """
    spec = ENGINE_REGISTRY.get(engine_id)
    if not spec or not spec.vram_key:
        return False
    try:
        from integrations.service_tools.vram_manager import get_vram_manager
        return get_vram_manager().can_fit(spec.vram_key)
    except Exception:
        # ImportError is an Exception subclass — one clause suffices.
        return False

944 

945 

946def _get_compute_policy() -> Dict[str, Any]: 

947 """Get user's compute policy (local_only / local_preferred / any).""" 

948 try: 

949 from integrations.agent_engine.compute_config import get_compute_policy 

950 return get_compute_policy() 

951 except (ImportError, Exception): 

952 return {'compute_policy': 'local_preferred'} 

953 

954 

955# ═══════════════════════════════════════════════════════════════ 

956# Hive Peer TTS Offload 

957# ═══════════════════════════════════════════════════════════════ 

958 

959def _find_hive_peer_for_tts(language: str) -> Optional[Dict[str, Any]]: 

960 # TODO REFACTOR: move to orchestrator as find_peer_for(model_type, language) — 

961 # hive peer offloading applies to all model types (STT, VLM, LLM), not just TTS. 

962 """Find a hive peer with GPU that can serve TTS for this language. 

963 

964 Returns peer info dict or None. 

965 """ 

966 try: 

967 from integrations.agent_engine.compute_mesh_service import get_compute_mesh 

968 mesh = get_compute_mesh() 

969 if not mesh or not mesh.peers: 

970 return None 

971 

972 for peer in mesh.peers.values(): 

973 if not peer.available_compute or peer.available_compute < 0.1: 

974 continue 

975 # Peer has GPU and capacity 

976 caps = peer.capabilities or {} 

977 if caps.get('gpu'): 

978 return { 

979 'peer_id': peer.peer_id, 

980 'address': peer.address, 

981 'latency_ms': peer.latency_ms or 500, 

982 'gpu': caps.get('gpu', 'unknown'), 

983 } 

984 return None 

985 except (ImportError, Exception): 

986 return None 

987 

988 

def _offload_tts_to_peer(peer: Dict, text: str, language: str,
                         voice: Optional[str] = None) -> Optional[Dict]:
    """Offload TTS synthesis to a hive peer via compute mesh (DRY — reuses mesh service).

    Args:
        peer: Peer info dict. NOTE(review): currently unused — the mesh
            picks its own best peer; kept for a future targeted-offload API.
        text: Text to synthesize.
        language: Language code forwarded in the offload options.
        voice: Voice name; 'default' is sent when not provided.

    Returns:
        The peer's result dict on success; None when the mesh is missing,
        the peer returned an error, or the call raised.
    """
    try:
        from integrations.agent_engine.compute_mesh_service import get_compute_mesh
        mesh = get_compute_mesh()
        if not mesh:
            return None
        result = mesh.offload_to_best_peer(
            model_type='tts',
            prompt=text,
            options={'language': language, 'voice': voice or 'default'},
        )
        if result and 'error' not in result:
            return result
    except Exception as e:
        # ImportError is an Exception subclass — one clause suffices.
        # Offload is best-effort: log and fall through to None.
        logger.debug("Hive TTS offload failed: %s", e)
    return None

1007 

1008 

1009# ═══════════════════════════════════════════════════════════════ 

1010# TTSRouter — the brain 

1011# ═══════════════════════════════════════════════════════════════ 

1012 

class TTSRouter:
    """Smart TTS engine selector and dispatcher.

    Considers language, hardware, compute policy, latency, and hive peers
    to select the best engine for each synthesis request.
    """

    # engine_id -> (service-tool module, synth function) for every engine
    # that follows the generic fn(text, language=, voice=, output_path=)
    # convention handled by _call_gpu_engine(). luxtts / pocket_tts /
    # espeak have bespoke signatures and keep dedicated wrappers.
    _GPU_ENGINE_TOOLS: Dict[str, Tuple[str, str]] = {
        'cosyvoice3': ('integrations.service_tools.cosyvoice_tool', 'cosyvoice_synthesize'),
        'indic_parler': ('integrations.service_tools.indic_parler_tool', 'indic_parler_synthesize'),
        'chatterbox_turbo': ('integrations.service_tools.chatterbox_tool', 'chatterbox_synthesize'),
        'chatterbox_ml': ('integrations.service_tools.chatterbox_tool', 'chatterbox_ml_synthesize'),
        'f5_tts': ('integrations.service_tools.f5_tts_tool', 'f5_synthesize'),
        'kokoro': ('integrations.service_tools.kokoro_tool', 'kokoro_synthesize'),
        'melotts': ('integrations.service_tools.melotts_tool', 'melotts_synthesize'),
        'xtts_v2': ('integrations.service_tools.xtts_tool', 'xtts_synthesize'),
        'mms_tts': ('integrations.service_tools.mms_tts_tool', 'mms_tts_synthesize'),
    }

    def select_engines(  # TODO REFACTOR: remove — catalog.select_best() is the single selector.
        # Language preferences feed into catalog via populate_tts_catalog()'s language_priority.
        # Move _is_engine_installed() to catalog, _find_hive_peer to orchestrator.
        self,
        text: str,
        language: Optional[str] = None,
        voice: Optional[str] = None,
        urgency: str = 'normal',
        require_clone: bool = False,
    ) -> List[TTSCandidate]:
        """Select and rank TTS engines for the given request.

        Args:
            text: Text to synthesize
            language: ISO language code (auto-detected if None)
            voice: Voice reference (triggers clone-capable filter)
            urgency: 'instant' (fastest), 'normal', 'quality' (best quality)
            require_clone: Only return engines with voice cloning

        Returns:
            Ranked list of TTSCandidate (best first), never empty
        """
        # Step 1: Detect + normalize language. Look up the full lowered
        # code FIRST so multi-char keys in LANG_ENGINE_PREFERENCE
        # ('zh-cn', 'brx', 'doi', 'kok', 'mai', 'mni', 'sat', …) are
        # reachable — the previous blind `lang[:2]` truncation made every
        # one of those ladder entries a dead key.
        lang = (language or detect_language(text)).lower()

        # Step 2: Get preferred engines for this language; fall back to the
        # 2-char base code ('pt-br' -> 'pt'), then the default ladder.
        preferred = LANG_ENGINE_PREFERENCE.get(lang)
        if preferred is None:
            lang = lang.split('-', 1)[0][:2]
            preferred = LANG_ENGINE_PREFERENCE.get(lang, _DEFAULT_PREFERENCE)

        # Step 3: Gather constraints
        gpu_info = _get_gpu_info()
        has_gpu = gpu_info.get('cuda_available', False)
        policy = _get_compute_policy()
        compute_mode = policy.get('compute_policy', 'local_preferred')

        # Step 4: Score each candidate
        candidates: List[TTSCandidate] = []
        seen = set()

        for engine_id in preferred:
            if engine_id in seen:
                continue
            seen.add(engine_id)

            spec = ENGINE_REGISTRY.get(engine_id)
            if not spec:
                continue

            # Voice cloning filter
            if require_clone and not spec.voice_clone:
                continue

            # --- LOCAL availability ---
            if spec.device == TTSDevice.CLOUD:
                # Cloud engines: skip if local_only
                if compute_mode == 'local_only':
                    continue
                if _is_engine_installed(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.CLOUD,
                        device='cloud',
                        estimated_latency_ms=spec.latency_cloud_ms,
                        quality_score=spec.quality,
                    ))
                continue

            if spec.device == TTSDevice.GPU_ONLY:
                if has_gpu and _can_fit_on_gpu(engine_id):
                    if _is_engine_installed(engine_id):
                        candidates.append(TTSCandidate(
                            engine=spec,
                            location=TTSLocation.LOCAL,
                            device='gpu',
                            estimated_latency_ms=spec.latency_gpu_ms,
                            quality_score=spec.quality,
                        ))
                    continue

                # GPU engine not available locally — try hive peer
                if compute_mode != 'local_only':
                    peer = _find_hive_peer_for_tts(lang)
                    if peer:
                        candidates.append(TTSCandidate(
                            engine=spec,
                            location=TTSLocation.HIVE_PEER,
                            device='gpu',
                            estimated_latency_ms=spec.latency_gpu_ms + peer['latency_ms'],
                            quality_score=spec.quality * 0.95,  # slight penalty for network
                            peer_address=peer['address'],
                            warnings=[f"Offloaded to hive peer {peer['peer_id']}"],
                        ))
                continue

            if spec.device == TTSDevice.GPU_PREFERRED:
                if not _is_engine_installed(engine_id):
                    continue
                if has_gpu and _can_fit_on_gpu(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='gpu',
                        estimated_latency_ms=spec.latency_gpu_ms,
                        quality_score=spec.quality,
                    ))
                else:
                    # CPU fallback
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='cpu',
                        estimated_latency_ms=spec.latency_cpu_ms,
                        quality_score=spec.quality * 0.9,  # CPU quality slightly lower
                        warnings=['Running on CPU (slower, install GPU for better perf)'],
                    ))
                continue

            if spec.device == TTSDevice.CPU_ONLY:
                if _is_engine_installed(engine_id):
                    candidates.append(TTSCandidate(
                        engine=spec,
                        location=TTSLocation.LOCAL,
                        device='cpu',
                        estimated_latency_ms=spec.latency_cpu_ms,
                        quality_score=spec.quality,
                    ))
                continue

        # Step 5: Always ensure espeak as ultimate fallback
        if not any(c.engine.engine_id == 'espeak' for c in candidates):
            espeak_spec = ENGINE_REGISTRY['espeak']
            candidates.append(TTSCandidate(
                engine=espeak_spec,
                location=TTSLocation.LOCAL,
                device='cpu',
                estimated_latency_ms=10,
                quality_score=espeak_spec.quality,
                warnings=['Fallback: no neural TTS available for this language'],
            ))

        # Step 6: Sort by urgency-weighted score
        if urgency == 'instant':
            # Minimize latency — instant response
            candidates.sort(key=lambda c: (c.estimated_latency_ms, -c.quality_score))
        elif urgency == 'quality':
            # Maximize quality — don't care about latency
            candidates.sort(key=lambda c: (-c.quality_score, c.estimated_latency_ms))
        else:
            # Balance: quality * 0.6 + inverse_latency * 0.4
            max_latency = max(c.estimated_latency_ms for c in candidates) or 1
            candidates.sort(key=lambda c: -(
                c.quality_score * 0.6 +
                (1 - c.estimated_latency_ms / max_latency) * 0.4
            ))

        return candidates

    def synthesize(
        self,
        text: str,
        language: Optional[str] = None,
        voice: Optional[str] = None,
        output_path: Optional[str] = None,
        source: Optional[str] = None,
        urgency: str = 'normal',
        engine_override: Optional[str] = None,
    ) -> TTSResult:
        """Synthesize text using the best available TTS engine.

        Tries engines in ranked order until one succeeds.

        Args:
            text: Text to synthesize
            language: ISO 639-1 code (auto-detected if None)
            voice: Voice reference for cloning (path or saved name)
            output_path: Where to write WAV (auto-generated if None)
            source: Context hint (e.g. 'chat_response', 'greeting') —
                auto-maps to urgency via SOURCE_URGENCY
            urgency: 'instant' | 'normal' | 'quality' (used if source not set)
            engine_override: Force a specific engine (bypasses selection)

        Returns:
            TTSResult with synthesis details
        """
        # Auto-infer urgency from source hint
        if source:
            urgency = SOURCE_URGENCY.get(source, urgency)
        if not text or not text.strip():
            return TTSResult(
                path='', duration=0, engine_id='none', device='none',
                location='none', latency_ms=0, sample_rate=0, voice='',
                quality_score=0, error='Text is required',
            )

        lang = language or detect_language(text)

        # Normalize numbers, currency, URLs, units to spoken form BEFORE
        # engine selection — every TTS engine benefits (single converging
        # path). Latency-sensitive ('instant' urgency) skips the LLM
        # fallback but keeps the fast rule pass.
        try:
            from integrations.channels.media.tts_text_normalizer import (
                normalize_for_tts,
            )
            text = normalize_for_tts(
                text, lang, use_llm=(urgency != 'instant'),
            )
        except Exception as _e:  # never let normalization block synthesis
            logger.debug('tts normalization skipped: %s', _e)

        # Any explicit voice (beyond the defaults) requires clone support.
        require_clone = voice not in (None, '', 'default')

        # Engine override
        if engine_override and engine_override in ENGINE_REGISTRY:
            spec = ENGINE_REGISTRY[engine_override]
            candidates = [TTSCandidate(
                engine=spec,
                location=TTSLocation.LOCAL,
                device='gpu' if spec.device in (TTSDevice.GPU_ONLY, TTSDevice.GPU_PREFERRED) else 'cpu',
                estimated_latency_ms=spec.latency_gpu_ms or spec.latency_cpu_ms,
                quality_score=spec.quality,
            )]
        else:
            candidates = self.select_engines(
                text, lang, voice, urgency, require_clone,
            )

        # Try each candidate in order; collect failure reasons as warnings.
        all_warnings = []
        for candidate in candidates:
            t0 = time.time()
            try:
                result = self._execute(candidate, text, lang, voice, output_path)
                elapsed = (time.time() - t0) * 1000
                if result and not result.get('error'):
                    all_warnings.extend(candidate.warnings)
                    return TTSResult(
                        path=result.get('path', ''),
                        duration=result.get('duration', 0),
                        engine_id=candidate.engine.engine_id,
                        device=candidate.device,
                        location=candidate.location.value,
                        latency_ms=round(elapsed, 1),
                        sample_rate=result.get('sample_rate', candidate.engine.sample_rate),
                        voice=result.get('voice', voice or 'default'),
                        quality_score=candidate.quality_score,
                        warnings=all_warnings,
                    )
                else:
                    err = result.get('error', 'unknown') if result else 'no result'
                    all_warnings.append(
                        f"{candidate.engine.engine_id} failed: {err}"
                    )
            except Exception as e:
                all_warnings.append(f"{candidate.engine.engine_id} error: {e}")
                logger.debug("TTS engine %s failed: %s", candidate.engine.engine_id, e)

        # All engines failed
        return TTSResult(
            path='', duration=0, engine_id='none', device='none',
            location='none', latency_ms=0, sample_rate=0, voice='',
            quality_score=0, warnings=all_warnings,
            error='All TTS engines failed',
        )

    def _execute(
        self, candidate: TTSCandidate, text: str,
        language: str, voice: Optional[str], output_path: Optional[str],
    ) -> Optional[Dict[str, Any]]:
        """Execute TTS on a specific candidate engine.

        Returns the engine's result dict (or an {'error': ...} dict).
        """

        # Hive peer offload
        if candidate.location == TTSLocation.HIVE_PEER:
            peer_info = {
                'address': candidate.peer_address,
                'peer_id': 'hive',
                'latency_ms': candidate.estimated_latency_ms,
            }
            result = _offload_tts_to_peer(peer_info, text, language, voice)
            return result

        # Cloud (MakeItTalk)
        if candidate.location == TTSLocation.CLOUD:
            return self._execute_makeittalk(text, voice)

        # Local engine — bespoke wrappers first, then the generic table.
        engine_id = candidate.engine.engine_id

        if engine_id == 'luxtts':
            return self._call_luxtts(text, voice, output_path, candidate.device)
        if engine_id == 'pocket_tts':
            return self._call_pocket_tts(text, voice, output_path)
        if engine_id == 'espeak':
            return self._call_espeak(text, language, output_path)

        tool = self._GPU_ENGINE_TOOLS.get(engine_id)
        if tool:
            module_path, function_name = tool
            return self._call_gpu_engine(
                module_path, function_name, text, language, voice, output_path,
            )
        return {'error': f'Unknown engine: {engine_id}'}

    def _call_luxtts(self, text, voice, output_path, device):
        """Call LuxTTS (supports explicit device selection)."""
        from integrations.service_tools.luxtts_tool import luxtts_synthesize
        result_str = luxtts_synthesize(
            text, voice_audio=voice, output_path=output_path, device=device,
        )
        return json.loads(result_str)

    def _call_pocket_tts(self, text, voice, output_path):
        """Call Pocket-TTS; 'alba' is its default built-in voice."""
        from integrations.service_tools.pocket_tts_tool import pocket_tts_synthesize
        voice_name = voice if voice and voice != 'default' else 'alba'
        result_str = pocket_tts_synthesize(text, voice_name, output_path)
        return json.loads(result_str)

    def _call_espeak(self, text, language, output_path):
        """Call espeak-ng via pocket_tts_tool (DRY — reuses existing impl)."""
        import os

        if not output_path:
            out_dir = os.environ.get('TTS_TEMP_DIR', '/tmp/tts')
            os.makedirs(out_dir, exist_ok=True)
            output_path = os.path.join(out_dir, f'espeak_{int(time.time()*1000)}.wav')

        try:
            from integrations.service_tools.pocket_tts_tool import _espeak_synthesize
            espeak_lang = language if language else 'en'
            # Cap input length; espeak is the emergency fallback, not a
            # long-form reader.
            if _espeak_synthesize(text[:5000], output_path, voice=espeak_lang):
                return {
                    'path': output_path,
                    'duration': len(text.split()) / 150 * 60,  # ~150 wpm estimate
                    'sample_rate': 22050,
                    'voice': espeak_lang,
                    'engine': 'espeak-ng',
                }
            return {'error': 'espeak-ng not installed'}
        except Exception:
            # ImportError is an Exception subclass — one clause suffices.
            return {'error': 'espeak-ng not available'}

    def _call_gpu_engine(self, module_path, function_name, text, language,
                         voice, output_path):
        """Generic caller for GPU TTS service tools.

        Imports `module_path`, calls `function_name(text, language=, voice=,
        output_path=)` and decodes its JSON string result.
        """
        import importlib
        try:
            mod = importlib.import_module(module_path)
            fn = getattr(mod, function_name)
            result_str = fn(text, language=language, voice=voice,
                            output_path=output_path)
            return json.loads(result_str)
        except ImportError as e:
            return {'error': f'{module_path} not installed: {e}'}
        except Exception as e:
            return {'error': str(e)}

    def _execute_makeittalk(self, text, voice):
        """Cloud TTS via MakeItTalk API (requires MAKEITTALK_API_URL)."""
        import os
        base_url = os.environ.get('MAKEITTALK_API_URL')
        if not base_url:
            return {'error': 'MAKEITTALK_API_URL not set'}
        try:
            import requests
            resp = requests.post(
                f"{base_url}/video-gen/",
                json={
                    'text': text,
                    'voiceName': voice or 'af_bella',
                    'audio_only': True,
                },
                timeout=30,
            )
            if resp.status_code == 200:
                data = resp.json()
                audio_url = data.get('audio_url') or data.get('url', '')
                return {
                    'path': audio_url,
                    'duration': data.get('duration', 0),
                    'voice': voice or 'af_bella',
                    'engine': 'makeittalk',
                    'sample_rate': 24000,
                }
            return {'error': f'MakeItTalk HTTP {resp.status_code}'}
        except Exception as e:
            return {'error': f'MakeItTalk: {e}'}

    def get_engine_status(self) -> List[Dict[str, Any]]:
        """Report status of all TTS engines for diagnostics."""
        gpu_info = _get_gpu_info()
        has_gpu = gpu_info.get('cuda_available', False)
        statuses = []

        for eid, spec in ENGINE_REGISTRY.items():
            installed = _is_engine_installed(eid)
            can_run = False
            device = 'n/a'

            if spec.device == TTSDevice.CPU_ONLY:
                can_run = installed
                device = 'cpu'
            elif spec.device == TTSDevice.GPU_ONLY:
                can_run = installed and has_gpu and _can_fit_on_gpu(eid)
                device = 'gpu' if can_run else 'n/a'
            elif spec.device == TTSDevice.GPU_PREFERRED:
                can_run = installed
                device = 'gpu' if (has_gpu and _can_fit_on_gpu(eid)) else 'cpu'
            elif spec.device == TTSDevice.CLOUD:
                can_run = installed
                device = 'cloud'

            statuses.append({
                'engine': eid,
                'installed': installed,
                'can_run': can_run,
                'device': device,
                'languages': list(spec.languages),
                'quality': spec.quality,
                'voice_clone': spec.voice_clone,
                # NOTE(review): key says 'gb' but carries the registry
                # vram_key — confirm what consumers expect before renaming.
                'vram_gb': spec.vram_key,
            })

        return statuses

    def get_all_voices(self) -> List[Dict[str, Any]]:
        """Aggregate available voices from all installed TTS engines."""
        voices: List[Dict[str, Any]] = []
        try:
            from integrations.service_tools.pocket_tts_tool import (
                _BUILTIN_VOICES,
            )
            for v in _BUILTIN_VOICES:
                voices.append({'id': v, 'engine': 'pocket_tts', 'type': 'builtin'})
        except Exception:
            # Engine not installed — best-effort aggregation, skip it.
            pass
        try:
            from integrations.service_tools.luxtts_tool import luxtts_list_voices
            import json as _json
            result = _json.loads(luxtts_list_voices())
            for v in result.get('voices', []):
                voices.append({'id': v.get('id', ''), 'engine': 'luxtts', 'type': 'cloned'})
        except Exception:
            # Engine not installed — best-effort aggregation, skip it.
            pass
        return voices

1513 

1514 

1515# ═══════════════════════════════════════════════════════════════ 

1516# Singleton 

1517# ═══════════════════════════════════════════════════════════════ 

1518 

# Module-level singleton; access only through get_tts_router().
_router_instance: Optional[TTSRouter] = None


def get_tts_router() -> TTSRouter:
    """Get the singleton TTS router, creating it lazily on first call.

    TTSRouter is stateless, so no locking is needed; a duplicate
    instance created under a race would behave identically.
    """
    global _router_instance
    if _router_instance is None:
        _router_instance = TTSRouter()
    return _router_instance

1528 

1529 

1530# ═══════════════════════════════════════════════════════════════ 

1531# ModelCatalog integration — populate_tts_catalog() 

1532# ═══════════════════════════════════════════════════════════════ 

1533 

1534# Reflection-dispatch contract for catalog entries that have NO 

1535# `tool_module` (pure-JSON model registration via admin UI / hive 

1536# federation / model_catalog.json edit). An entry without `tool_module` 

1537# MUST declare every field below in its `capabilities` dict — otherwise 

1538# the dispatcher has no way to know how to instantiate the class, marshal 

1539# the request, or normalize the return. See task #58 for the full 

1540# rationale; the schema is finalized at 5 fields, no more. 

# All five fields are mandatory together — _validate_engine_caps()
# rejects entries that declare only a subset.
_REFLECTION_FIELDS: Tuple[str, ...] = (
    'import_path',    # 'pkg.module:ClassName'
    'init_args',      # dict — kwargs for ClassName(**init_args); {} OK
    'synth_method',   # str — instance method name
    'params_map',     # dict — {payload_key → method_kwarg}
    'output_format',  # canonical id (see _OUTPUT_FORMATS below)
)

1548 

1549# Canonical return-shape identifiers the reflection dispatcher knows 

1550# how to normalize into a wire-format wav (or path). Engines that 

1551# return shapes outside this set MUST use the `tool_module` escape 

1552# hatch instead — the dispatcher won't guess. 

# Closed set: adding an id here also requires teaching the dispatcher
# how to normalize that shape — never extend one without the other.
_OUTPUT_FORMATS: Tuple[str, ...] = (
    'wav_bytes',   # bytes object holding a WAV-formatted byte stream
    'numpy_24k',   # 1-D float32 numpy array @ 24 kHz mono
    'file_path',   # str path to a wav file the engine wrote
    'bytesio',     # io.BytesIO containing wav bytes
)

1559 

1560 

1561def _validate_engine_caps(caps: Dict[str, Any]) -> Optional[str]: 

1562 """Validate a TTS catalog entry's capabilities dict. 

1563 

1564 Returns None when the entry is dispatchable, OR a human-readable 

1565 error string when it is not. Two valid shapes: 

1566 

1567 1. Python-tool path (escape hatch): 

1568 caps['tool_module'] = 'pkg.module' # required 

1569 The entry will be dispatched via the existing 

1570 `gpu_worker._dispatch_and_run` path: import the module, pick 

1571 up `_load[_<variant>]` / `_synthesize[_<variant>]` callbacks 

1572 by convention. This is what every code-shipped engine in 

1573 ENGINE_REGISTRY uses today. 

1574 

1575 2. Pure-config / reflection path: 

1576 caps lacks tool_module BUT declares ALL of _REFLECTION_FIELDS. 

1577 The dispatcher will use reflection to instantiate the class 

1578 and call the synth method — no .py file needed for adding 

1579 new models that fit a homogeneous load+method API (Kokoro, 

1580 Pocket-TTS, etc., evaluated empirically per engine). 

1581 

1582 Validation fires at INGEST time (populate_tts_catalog upsert path 

1583 AND _catalog_entry_to_spec read path) so a malformed entry cannot 

1584 reach the dispatcher. This guards against the "user discovers the 

1585 error only when they request the voice" failure mode. 

1586 """ 

1587 if not isinstance(caps, dict): 

1588 return f'capabilities must be a dict, got {type(caps).__name__}' 

1589 

1590 if caps.get('tool_module'): 

1591 # Python-tool entry — tool_module on its own is sufficient. The 

1592 # dispatcher will pick up _load / _synthesize via convention. 

1593 return None 

1594 

1595 # Reflection entry — every field is required. No partial schemas. 

1596 missing = [f for f in _REFLECTION_FIELDS if f not in caps] 

1597 if missing: 

1598 return ( 

1599 f'entry has no tool_module and is missing reflection fields ' 

1600 f'{missing}; reflection dispatch needs the full 5-field ' 

1601 f'contract: {list(_REFLECTION_FIELDS)}' 

1602 ) 

1603 

1604 # Cheap shape sanity — early-fail with a precise message rather than 

1605 # let the dispatcher trip on a bad type at synth time. 

1606 if not isinstance(caps.get('init_args'), dict): 

1607 return f'init_args must be a dict, got {type(caps.get("init_args")).__name__}' 

1608 if not isinstance(caps.get('params_map'), dict): 

1609 return f'params_map must be a dict, got {type(caps.get("params_map")).__name__}' 

1610 if not isinstance(caps.get('synth_method'), str) or not caps['synth_method']: 

1611 return 'synth_method must be a non-empty str' 

1612 if not isinstance(caps.get('import_path'), str) or ':' not in caps['import_path']: 

1613 return ( 

1614 f'import_path must be "pkg.module:ClassName", got ' 

1615 f'{caps.get("import_path")!r}' 

1616 ) 

1617 if caps.get('output_format') not in _OUTPUT_FORMATS: 

1618 return ( 

1619 f'output_format must be one of {list(_OUTPUT_FORMATS)}, got ' 

1620 f'{caps.get("output_format")!r}' 

1621 ) 

1622 return None 

1623 

1624 

# Human-readable display names for each engine (used in admin UI).
# Keys are ENGINE_REGISTRY engine ids; populate_tts_catalog falls back
# to the raw engine id for any engine missing from this map.
_ENGINE_DISPLAY_NAMES: Dict[str, str] = {
    'chatterbox_turbo': 'Chatterbox Turbo (GPU, English, voice-clone)',
    'luxtts': 'LuxTTS (CPU, English, voice-clone)',
    'cosyvoice3': 'CosyVoice 3 (GPU, multilingual, voice-clone)',
    'f5_tts': 'F5-TTS (GPU, EN/ZH, voice-clone)',
    'indic_parler': 'Indic Parler-TTS (GPU, 22 Indic languages)',
    'chatterbox_ml': 'Chatterbox Multilingual (GPU, 23 languages, voice-clone)',
    'pocket_tts': 'Pocket TTS (CPU, English, voice-clone)',
    'kokoro': 'Kokoro 82M (CPU/GPU, English, neural)',
    'espeak': 'eSpeak-NG (CPU, 100+ languages, instant fallback)',
    'makeittalk': 'MakeItTalk (Cloud, English)',
    'melotts': 'MeloTTS (CPU/GPU, 6 langs, neural)',
    'xtts_v2': 'XTTS-v2 (GPU, 17 langs, voice-clone)',
    'mms_tts': 'MMS-TTS (CPU/GPU, 50+ langs via VITS)',
}

1641 

1642# Extra capabilities per engine that don't map 1-to-1 onto TTSEngineSpec fields 

1643_ENGINE_EXTRA_CAPS: Dict[str, Dict[str, Any]] = { 

1644 'chatterbox_turbo': { 

1645 'streaming': False, 

1646 'paralinguistic': ['emotion_happy', 'emotion_sad', 'emotion_angry', 

1647 'emotion_surprised', 'laughing', 'whispering'], 

1648 'emotion_tags': True, 

1649 }, 

1650 'luxtts': { 

1651 'streaming': False, 

1652 'paralinguistic': [], 

1653 'emotion_tags': False, 

1654 }, 

1655 'cosyvoice3': { 

1656 'streaming': True, 

1657 'paralinguistic': ['emotion_happy', 'emotion_sad', 'whispering'], 

1658 'emotion_tags': True, 

1659 }, 

1660 'f5_tts': { 

1661 'streaming': False, 

1662 'paralinguistic': [], 

1663 'emotion_tags': False, 

1664 }, 

1665 'indic_parler': { 

1666 'streaming': False, 

1667 'paralinguistic': [], 

1668 'emotion_tags': False, 

1669 }, 

1670 'chatterbox_ml': { 

1671 'streaming': False, 

1672 'paralinguistic': ['emotion_happy', 'emotion_sad', 'whispering'], 

1673 'emotion_tags': True, 

1674 }, 

1675 'pocket_tts': { 

1676 'streaming': False, 

1677 'paralinguistic': [], 

1678 'emotion_tags': False, 

1679 }, 

1680 'kokoro': { 

1681 'streaming': False, 

1682 'paralinguistic': [], 

1683 'emotion_tags': False, 

1684 }, 

1685 'espeak': { 

1686 'streaming': False, 

1687 'paralinguistic': [], 

1688 'emotion_tags': False, 

1689 }, 

1690 'makeittalk': { 

1691 'streaming': False, 

1692 'paralinguistic': [], 

1693 'emotion_tags': False, 

1694 }, 

1695 'melotts': { 

1696 'streaming': False, 

1697 'paralinguistic': [], 

1698 'emotion_tags': False, 

1699 }, 

1700 'xtts_v2': { 

1701 'streaming': False, 

1702 'paralinguistic': [], 

1703 'emotion_tags': False, 

1704 }, 

1705 'mms_tts': { 

1706 'streaming': False, 

1707 'paralinguistic': [], 

1708 'emotion_tags': False, 

1709 }, 

1710} 

1711 

# Device → backend string mapping for ModelEntry.backend field.
# populate_tts_catalog falls back to 'in_process' for devices missing
# from this map.
_DEVICE_TO_BACKEND: Dict[str, str] = {
    TTSDevice.GPU_ONLY.value: 'torch',
    TTSDevice.GPU_PREFERRED.value: 'torch',
    TTSDevice.CPU_ONLY.value: 'in_process',
    TTSDevice.CLOUD.value: 'api',
}

# Device → supports_gpu / supports_cpu flags.
# CLOUD is (False, False): synthesis happens off-box, so neither local
# compute flag applies. _catalog_entry_to_spec relies on this pairing
# (together with latency_cloud_ms > 0) to round-trip the CLOUD device.
_DEVICE_TO_COMPUTE: Dict[str, Tuple[bool, bool]] = {
    # (supports_gpu, supports_cpu)
    TTSDevice.GPU_ONLY.value: (True, False),
    TTSDevice.GPU_PREFERRED.value: (True, True),
    TTSDevice.CPU_ONLY.value: (False, True),
    TTSDevice.CLOUD.value: (False, False),
}

1728 

# DEPRECATED: VRAM specs now live in vram_manager.VRAM_BUDGETS (single
# source of truth). Use _engine_vram_gb(engine_id) helper below.
# This dict is kept for backward compatibility but should NOT be edited.
_ENGINE_VRAM_GB: Dict[str, float] = {}  # populated lazily by _engine_vram_gb


def _engine_vram_gb(engine_id: str) -> float:
    """Single source of truth for an engine's VRAM requirement (GB).

    Reads from vram_manager.VRAM_BUDGETS — the canonical specs — using
    the key convention 'tts_<engine_id>' (e.g. 'tts_indic_parler').
    Returns 0.0 when the engine has no VRAM_BUDGETS entry or when
    vram_manager cannot be imported (CPU-only assumption). Either miss
    is logged at DEBUG level; because the 0.0 verdict is cached in
    _ENGINE_VRAM_GB, the message fires at most once per engine — this
    catches drift between the two registries without log spam.
    """
    # Cache hit — covers both real budgets and the cached 0.0 verdict.
    if engine_id in _ENGINE_VRAM_GB:
        return _ENGINE_VRAM_GB[engine_id]
    try:
        # Import deferred to call time: vram_manager may be absent on
        # CPU-only installs, and a module-level import could cycle.
        from integrations.service_tools.vram_manager import VRAM_BUDGETS
        key = f'tts_{engine_id}'
        if key in VRAM_BUDGETS:
            vram = VRAM_BUDGETS[key][0]  # (gpu_gb, cpu_gb)
            _ENGINE_VRAM_GB[engine_id] = vram
            return vram
        # Engine not registered in vram_manager — log once, assume CPU
        logger.debug(
            "TTS engine %r has no VRAM_BUDGETS entry (key=%r) — "
            "assuming CPU-only. Add to vram_manager.VRAM_BUDGETS if GPU-capable.",
            engine_id, key,
        )
    except ImportError:
        logger.debug("vram_manager unavailable, assuming CPU-only for %r", engine_id)
    # Cache the CPU-only verdict so subsequent calls skip the lookup
    # (and the debug log) entirely.
    _ENGINE_VRAM_GB[engine_id] = 0.0
    return 0.0

1763 

# Approximate disk footprint per engine (GB).
# Engines missing from this map default to 0.0 in populate_tts_catalog.
_ENGINE_DISK_GB: Dict[str, float] = {
    'chatterbox_turbo': 2.0,
    'luxtts': 0.5,
    'cosyvoice3': 3.5,
    'f5_tts': 2.5,
    'indic_parler': 4.0,
    'chatterbox_ml': 3.0,
    'pocket_tts': 0.1,
    'espeak': 0.05,
    'makeittalk': 0.0,
    'melotts': 1.5,  # 6 per-lang checkpoints, ~250 MB each
    'xtts_v2': 2.0,  # weights + speakers + config
    'mms_tts': 0.2,  # ~150 MB per lang lazy-downloaded
}

# Approximate RAM needed for CPU-capable engines (GB).
# Engines missing from this map default to 0.5 in populate_tts_catalog.
_ENGINE_RAM_GB: Dict[str, float] = {
    'chatterbox_turbo': 2.0,
    'luxtts': 2.0,
    'cosyvoice3': 4.0,
    'f5_tts': 2.0,
    'indic_parler': 4.0,
    'chatterbox_ml': 4.0,
    'pocket_tts': 0.5,
    'espeak': 0.1,
    'makeittalk': 0.1,
    'melotts': 2.0,
    'xtts_v2': 3.0,
    'mms_tts': 1.5,
}

1795 

1796 

def populate_tts_catalog(catalog) -> int:
    """Convert ENGINE_REGISTRY into ModelEntry objects and register them.

    Called by ModelCatalog.populate_from_subsystems() via the populator
    plugin mechanism — keeps tts_router as the single source of truth for
    TTS engine capabilities.

    Validation contract (#58): admin- or hive-supplied catalog entries
    that exist BEFORE this populator runs are validated against
    `_validate_engine_caps`. Invalid entries are removed from the
    catalog with a logged WARNING — they cannot reach the dispatcher.
    This is the "fail-fast at catalog ingest, not synth time" half of
    the contract; the other half (validation on every read) lives in
    `_catalog_entry_to_spec`.

    Args:
        catalog: ModelCatalog instance (untyped to avoid a hard import
            at module level — the catalog is passed in by the caller).

    Returns:
        Number of new entries added (skips already-registered IDs).
    """
    # Lazy import inside function body — avoids circular import at module load
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Pre-pass: validate any existing TTS entries (admin/hive seeded the
    # catalog before us). Invalid entries are removed + logged so they
    # don't poison `_refresh_engine_registry_from_catalog` below. Code-
    # shipped engines (ENGINE_REGISTRY) ALWAYS have tool_module so they
    # never trip this; the gate exists for foreign manifests.
    drop_ids: List[str] = []
    for entry in list(catalog.list_by_type('tts')):
        err = _validate_engine_caps(entry.capabilities or {})
        if err:
            logger.warning(
                'TTS catalog entry %r rejected at ingest: %s', entry.id, err,
            )
            drop_ids.append(entry.id)
    # #58 Scope-2 (2026-05-07): reflection-only entries (caps lack
    # tool_module but declare the full 5-field contract) are now
    # dispatchable via `gpu_worker._dispatch_catalog_id` (`python -m
    # gpu_worker --catalog-id <id>`). They survive ingest as long as
    # `_validate_engine_caps` passes; they are EXCLUDED from the
    # ENGINE_REGISTRY snapshot by `_refresh_engine_registry_from_catalog`
    # because TTSEngineSpec carries `tool_module` as a non-optional
    # dispatch handle for the existing call sites. The catalog reads
    # them via the --catalog-id path instead.
    for bad_id in drop_ids:
        try:
            catalog.unregister(bad_id, persist=False)
        except Exception as exc:
            # Best-effort removal — a failed unregister only logs; the
            # entry was already rejected above and is never dispatched.
            logger.debug('failed to unregister invalid TTS entry %r: %s',
                         bad_id, exc)

    added = 0
    for engine_id, spec in ENGINE_REGISTRY.items():
        # Catalog id convention: 'tts-' prefix, dashes instead of
        # underscores (computed once; also used for the dedup check).
        entry_id = f'tts-{engine_id.replace("_", "-")}'
        # Skip if already registered (preserves user edits from admin UI)
        if catalog.get(entry_id) is not None:
            continue

        device_value = spec.device.value
        supports_gpu, supports_cpu = _DEVICE_TO_COMPUTE.get(
            device_value, (False, True)
        )
        backend = _DEVICE_TO_BACKEND.get(device_value, 'in_process')

        # Build language_priority from LANG_ENGINE_PREFERENCE:
        # lower rank in the preference list → lower priority number → preferred
        lang_priority: Dict[str, int] = {}
        for lang, engine_list in LANG_ENGINE_PREFERENCE.items():
            if engine_id in engine_list:
                rank = engine_list.index(engine_id)  # 0 = most preferred
                lang_priority[lang] = rank * 10  # 0, 10, 20, ...

        # Pick the best latency figure for quality/speed scores
        best_latency_ms = min(
            (v for v in (spec.latency_gpu_ms, spec.latency_cpu_ms,
                         spec.latency_cloud_ms) if v > 0),
            default=5000,
        )
        # speed_score: 1.0 = instant (≤10 ms), 0.0 = very slow (≥5000 ms).
        # Clamp BOTH ends: an engine faster than 10 ms would otherwise
        # score above 1.0 and skew ranking against quality_score.
        speed_score = min(1.0, max(0.0, 1.0 - (best_latency_ms - 10) / 4990))

        # Build capabilities dict — TTS-specific fields + extras
        extra = _ENGINE_EXTRA_CAPS.get(engine_id, {})
        capabilities: Dict[str, Any] = {
            'voice_clone': spec.voice_clone,
            'sample_rate': spec.sample_rate,
            'latency_gpu_ms': spec.latency_gpu_ms,
            'latency_cpu_ms': spec.latency_cpu_ms,
            'latency_cloud_ms': spec.latency_cloud_ms,
            'tool_module': spec.tool_module,
            'tool_function': spec.tool_function,
            'vram_key': spec.vram_key,
            'streaming': extra.get('streaming', False),
            'paralinguistic': extra.get('paralinguistic', []),
            'emotion_tags': extra.get('emotion_tags', False),
        }

        # languages list — ('*',) means "all"; store as-is so select_best
        # language matching still works (catalog treats '*' as wildcard)
        languages = list(spec.languages)

        entry = ModelEntry(
            id=entry_id,
            name=_ENGINE_DISPLAY_NAMES.get(engine_id, engine_id),
            model_type=ModelType.TTS,
            version='1.0',
            source='cloud' if spec.device == TTSDevice.CLOUD else 'local',
            vram_gb=_engine_vram_gb(engine_id),
            ram_gb=_ENGINE_RAM_GB.get(engine_id, 0.5),
            disk_gb=_ENGINE_DISK_GB.get(engine_id, 0.0),
            min_capability_tier='lite' if supports_cpu else 'standard',
            backend=backend,
            supports_gpu=supports_gpu,
            supports_cpu=supports_cpu,
            supports_cpu_offload=False,
            idle_timeout_s=300.0,
            capabilities=capabilities,
            quality_score=spec.quality,
            speed_score=round(speed_score, 3),
            priority=50,
            languages=languages,
            language_priority=lang_priority,
            tags=['tts', 'local' if spec.device != TTSDevice.CLOUD else 'cloud'],
            enabled=True,
            auto_load=False,
        )
        catalog.register(entry, persist=False)
        added += 1

    # Post-upsert: rebuild ENGINE_REGISTRY in place so it reflects the
    # current catalog state (admin/hive-edited entries become visible
    # to existing call sites). Snapshot semantics — runtime catalog
    # mutations after this point do NOT auto-propagate; a re-bootstrap
    # is required. Matches the dict-iter assumption every existing
    # ENGINE_REGISTRY caller relies on. See task #58 acceptance #5.
    _refresh_engine_registry_from_catalog(catalog)

    return added

1937 

1938 

def _refresh_engine_registry_from_catalog(catalog) -> int:
    """Rebuild ENGINE_REGISTRY in place from the post-upsert catalog.

    Reflection-only entries (no tool_module) are excluded — they exist
    only in the catalog and are dispatched via the `--catalog-id`
    path. TTSEngineSpec callers therefore keep seeing only spec-shaped
    entries, exactly as before this refactor.

    Idempotent: re-running against an unchanged catalog yields the
    same registry contents.

    Returns:
        The number of entries in the rebuilt registry.
    """
    rebuilt: Dict[str, TTSEngineSpec] = {}
    for catalog_entry in catalog.list_by_type('tts'):
        converted = _catalog_entry_to_spec(catalog_entry)
        # None means validation failed or the entry is reflection-only.
        if converted is not None:
            rebuilt[converted.engine_id] = converted
    ENGINE_REGISTRY.clear()
    ENGINE_REGISTRY.update(rebuilt)
    return len(ENGINE_REGISTRY)

1961 

1962 

def _catalog_entry_to_spec(entry) -> Optional[TTSEngineSpec]:
    """Convert a ModelCatalog ModelEntry back into a TTSEngineSpec.

    Used by code that needs a TTSEngineSpec but only holds a catalog
    entry (e.g. the router consulting the catalog for dynamically
    registered engines absent from ENGINE_REGISTRY at startup).

    Returns None when:
      * the entry's capabilities fail `_validate_engine_caps` (#58
        contract) — the dispatcher cannot route to it, so callers
        must not see it; or
      * the entry is a valid reflection-only entry (no tool_module).
        TTSEngineSpec carries `tool_module` as a non-optional dispatch
        handle, so reflection-only entries stay catalog-only and are
        dispatched via the --catalog-id path instead.
    """
    capabilities = entry.capabilities or {}
    if _validate_engine_caps(capabilities) is not None:
        # Rejections are logged loudly at ingest; stay silent on
        # subsequent re-reads to avoid spamming every catalog scan.
        return None

    module = capabilities.get('tool_module')
    if not module:
        # Valid reflection-only entry — intentionally excluded from the
        # ENGINE_REGISTRY snapshot (see _refresh_engine_registry_from_catalog).
        return None

    # Recover the TTSDevice from the supports_* flags (+ cloud latency).
    gpu_ok = entry.supports_gpu
    cpu_ok = entry.supports_cpu
    if capabilities.get('latency_cloud_ms', 0) > 0 and not gpu_ok and not cpu_ok:
        device = TTSDevice.CLOUD
    elif gpu_ok and cpu_ok:
        device = TTSDevice.GPU_PREFERRED
    elif gpu_ok:
        device = TTSDevice.GPU_ONLY
    else:
        device = TTSDevice.CPU_ONLY

    # Strip the 'tts-' prefix that populate_tts_catalog adds
    engine_id = entry.id[len('tts-'):] if entry.id.startswith('tts-') else entry.id

    return TTSEngineSpec(
        engine_id=engine_id,
        device=device,
        vram_key=capabilities.get('vram_key', ''),
        languages=tuple(entry.languages) if entry.languages else ('en',),
        quality=entry.quality_score,
        voice_clone=capabilities.get('voice_clone', False),
        latency_gpu_ms=capabilities.get('latency_gpu_ms', 0),
        latency_cpu_ms=capabilities.get('latency_cpu_ms', 0),
        latency_cloud_ms=capabilities.get('latency_cloud_ms', 0),
        tool_module=module,
        tool_function=capabilities.get('tool_function'),
        sample_rate=capabilities.get('sample_rate', 24000),
    )