Coverage for integrations / service_tools / neutts_tool.py: 0.0%

121 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1"""TTS tool — text-to-speech via NeuTTS Air (Neuphonic). 

2 

3NeuTTS Air benefits: 

4 - 748M params on Qwen2 backbone, GGUF Q4 (~600MB) / Q8 (~800MB) 

5 - Apache 2.0, fully open source 

6 - On-device: RTF<0.5 on CPU, 24kHz output 

7 - Instant voice cloning from 3-15s reference audio 

8 - English primary; CPU-friendly (no GPU required for Q4) 

9 

10Pip package: ``neutts`` on PyPI. Optional extras: ``neutts[all]`` pulls 

11``llama-cpp-python`` (GGUF inference) + ``soundfile`` + ``onnxruntime`` 

12(codec decoder). License Apache-2.0. 

13 

14Architecture (ToolWorker pattern — same as kokoro / chatterbox / f5): 

15 - This module exposes ``_load`` + ``_synthesize`` (subprocess-side 

16 callbacks) and ``_tool`` (parent-side ``ToolWorker`` instance). 

17 - ``neutts_synthesize`` (the canonical public entry point referenced 

18 by ``tts_router.ENGINE_REGISTRY['neutts_air'].tool_function``) 

19 forwards through ``_tool.synthesize``. 

20 - The desktop installer (Nunba) routes ``neutts`` into a dedicated 

21 venv (``install_target='venv'`` in tts_router.py) because 

22 ``neutts[all]`` pulls ``llama-cpp-python`` whose torch / numpy 

23 pins can drift from the main interpreter. ``ToolWorker``'s 

24 ``python_exe`` is wired to the venv's python at install time so 

25 the synth subprocess sees the pinned neutts deps. 

26 

27Reference voices (NeuTTS requires a reference audio + transcript for 

28cloning — there is no zero-config default voice the way pocket_tts has 

29built-in 'alba'/'jean'/etc.). Resolution order: 

30 1. Path to a .wav (with companion .txt at the same stem) — ad-hoc 

31 2. Custom name → ``~/.hevolve/models/tts/neutts/voices/<name>.wav`` 

32 (with companion .txt) — persistent user-cloned voices 

33 3. ``'jo'`` (default) → upstream sample at 

34 ``<site-packages>/neutts/samples/jo.{wav,txt}`` 

35 

36Model downloaded lazily on first use (HuggingFace 

37``neuphonic/neutts-air-q4-gguf`` backbone + ``neuphonic/neucodec`` 

38codec). Env overrides: 

39 - ``NEUTTS_BACKBONE_REPO`` (default ``neuphonic/neutts-air-q4-gguf``) 

40 - ``NEUTTS_BACKBONE_DEVICE`` (default ``cpu``) 

41 - ``NEUTTS_CODEC_REPO`` (default ``neuphonic/neucodec``) 

42 - ``NEUTTS_CODEC_DEVICE`` (default ``cpu``) 

43 

44Public API (parent side): 

45 neutts_synthesize(text, language, voice, output_path) -> JSON 

46 neutts_list_voices() -> JSON 

47 unload_neutts() -> None 

48""" 

49 

50import json 

51import logging 

52import os 

53from pathlib import Path 

54from typing import Optional, Tuple 

55 

56from integrations.service_tools.gpu_worker import ToolWorker 

57 

58from .registry import ServiceToolInfo, service_tool_registry 

59 

60logger = logging.getLogger(__name__) 

61 

62 

63# ═══════════════════════════════════════════════════════════════ 

64# Storage paths (parent + subprocess use the same resolver — single 

65# source of truth for "where do user-cloned voice files live?") 

66# ═══════════════════════════════════════════════════════════════ 

67 

68def _get_tts_dir() -> Path: 

69 """Get the NeuTTS storage directory.""" 

70 try: 

71 from .model_storage import model_storage 

72 tts_dir = model_storage.get_tool_dir("neutts") 

73 except (ImportError, Exception): 

74 tts_dir = Path(os.path.expanduser("~/.hevolve/models/tts/neutts")) 

75 tts_dir.mkdir(parents=True, exist_ok=True) 

76 return tts_dir 

77 

78 

def _get_output_dir() -> Path:
    """Return the ``output`` folder under the NeuTTS storage root.

    The folder is created (with parents) if it does not exist yet.
    """
    output_root = _get_tts_dir().joinpath("output")
    output_root.mkdir(parents=True, exist_ok=True)
    return output_root

84 

85 

def _get_voices_dir() -> Path:
    """Return the ``voices`` folder that holds user-cloned references.

    Lives under the NeuTTS storage root and is created (with parents) if
    it does not exist yet.
    """
    voices_root = _get_tts_dir().joinpath("voices")
    voices_root.mkdir(parents=True, exist_ok=True)
    return voices_root

91 

92 

93def _resolve_reference(voice: str) -> Tuple[Optional[str], Optional[str]]: 

94 """Resolve a `voice` argument to (ref_audio_path, ref_text). 

95 

96 Resolution order: 

97 1. Path to a .wav file (companion .txt at same stem for transcript) 

98 2. Custom voice name → ~/.hevolve/models/tts/neutts/voices/{name}.wav 

99 3. Built-in name 'jo' → upstream sample shipped with neutts package 

100 4. Anything else → (None, None) — caller MUST treat as failure 

101 """ 

102 # 1. Direct path 

103 if voice and os.path.isfile(voice): 

104 wav = voice 

105 txt_path = os.path.splitext(voice)[0] + '.txt' 

106 if os.path.isfile(txt_path): 

107 with open(txt_path, encoding='utf-8') as fp: 

108 return wav, fp.read().strip() 

109 # No transcript — refuse rather than guess 

110 return None, None 

111 

112 # 2. Custom user-cloned voice in our voices dir 

113 custom_wav = _get_voices_dir() / f"{voice}.wav" 

114 custom_txt = _get_voices_dir() / f"{voice}.txt" 

115 if custom_wav.is_file() and custom_txt.is_file(): 

116 with open(custom_txt, encoding='utf-8') as fp: 

117 return str(custom_wav), fp.read().strip() 

118 

119 # 3. Upstream sample 'jo' shipped with the neutts package 

120 if voice == 'jo': 

121 try: 

122 import neutts # noqa: F401 

123 pkg_dir = Path(neutts.__file__).parent 

124 sample_wav = pkg_dir / 'samples' / 'jo.wav' 

125 sample_txt = pkg_dir / 'samples' / 'jo.txt' 

126 if sample_wav.is_file() and sample_txt.is_file(): 

127 with open(sample_txt, encoding='utf-8') as fp: 

128 return str(sample_wav), fp.read().strip() 

129 except ImportError: 

130 return None, None 

131 

132 return None, None 

133 

134 

135# ═══════════════════════════════════════════════════════════════ 

136# Subprocess-side callbacks (invoked by gpu_worker dispatcher) 

137# ═══════════════════════════════════════════════════════════════ 

138 

139def _load() -> dict: 

140 """Load NeuTTS Air once at subprocess startup. 

141 

142 Default device is CPU because NeuTTS's Q4 GGUF runs at <0.5x RTF 

143 on a modest consumer CPU per the upstream README — we don't burn 

144 GPU for an engine the CPU can already serve in real time. Users 

145 on big GPUs can override via ``NEUTTS_BACKBONE_DEVICE=cuda``. 

146 

147 Raises: 

148 ImportError: if the `neutts` package isn't installed. 

149 ToolWorker propagates this to the parent, which receives 

150 an `{error: ..., transient: false}` JSON and the TTS 

151 ladder traverses past us. 

152 """ 

153 try: 

154 from neutts import NeuTTS # type: ignore 

155 except ImportError as e: 

156 raise ImportError( 

157 f"neutts package not installed. " 

158 f"Install with: pip install neutts[all] ({e})" 

159 ) from e 

160 

161 backbone_device = os.environ.get('NEUTTS_BACKBONE_DEVICE', 'cpu') 

162 codec_device = os.environ.get('NEUTTS_CODEC_DEVICE', 'cpu') 

163 backbone_repo = os.environ.get( 

164 'NEUTTS_BACKBONE_REPO', 'neuphonic/neutts-air-q4-gguf') 

165 codec_repo = os.environ.get('NEUTTS_CODEC_REPO', 'neuphonic/neucodec') 

166 

167 logger.info( 

168 "Loading NeuTTS Air (backbone=%s on %s, codec=%s on %s)...", 

169 backbone_repo, backbone_device, codec_repo, codec_device, 

170 ) 

171 model = NeuTTS( 

172 backbone_repo=backbone_repo, 

173 backbone_device=backbone_device, 

174 codec_repo=codec_repo, 

175 codec_device=codec_device, 

176 ) 

177 logger.info("NeuTTS Air ready") 

178 return { 

179 'model': model, 

180 'device': backbone_device, 

181 # Reference codes are expensive to compute (run the codec 

182 # encoder over the wav). Cache per-voice for the life of the 

183 # subprocess so consecutive synth calls with the same voice 

184 # share the encode cost. 

185 'ref_cache': {}, 

186 } 

187 

188 

189def _synthesize(state, req: dict) -> dict: 

190 """Run one synthesis request inside the worker. 

191 

192 Args: 

193 state: dict returned by ``_load`` — holds the loaded model and 

194 the per-subprocess ref-codes cache. 

195 req: ``{text, voice, output_path, sample_rate?}`` request. 

196 

197 Returns: 

198 ``{path, duration, sample_rate, voice, engine}`` on success or 

199 ``{error, engine, transient}`` on failure. ``transient=False`` 

200 for "voice not configured" (deterministic — same input retries 

201 will fail the same way) and for missing-package errors. 

202 """ 

203 text = req.get('text', '') 

204 if not text or not text.strip(): 

205 return {'error': 'Text is required', 'engine': 'neutts_air'} 

206 

207 output_path = req.get('output_path') 

208 if not output_path: 

209 return {'error': 'output_path is required', 'engine': 'neutts_air'} 

210 

211 voice = req.get('voice') or 'jo' 

212 sample_rate = int(req.get('sample_rate') or 24000) 

213 

214 # Resolve + cache the reference codes (codec encoding is the slow 

215 # part; one-time cost per voice per subprocess). 

216 cache = state.get('ref_cache', {}) 

217 cached = cache.get(voice) 

218 if cached is None: 

219 ref_wav, ref_text = _resolve_reference(voice) 

220 if not ref_wav or not ref_text: 

221 return { 

222 'error': ( 

223 f"NeuTTS voice {voice!r} not configured (no reference " 

224 f"audio + transcript found). Provide a .wav with " 

225 f"companion .txt at the same stem, or use the " 

226 f"upstream 'jo' sample after installing the neutts " 

227 f"package." 

228 ), 

229 'engine': 'neutts_air', 

230 'transient': False, 

231 } 

232 try: 

233 ref_codes = state['model'].encode_reference(ref_wav) 

234 except Exception as e: 

235 return { 

236 'error': f"Reference encode failed: {type(e).__name__}: {e}", 

237 'engine': 'neutts_air', 

238 'transient': False, 

239 } 

240 cached = (ref_codes, ref_text) 

241 cache[voice] = cached 

242 state['ref_cache'] = cache 

243 ref_codes, ref_text = cached 

244 

245 try: 

246 wav = state['model'].infer(text, ref_codes, ref_text) 

247 except Exception as e: 

248 # Surface as transient ONLY for likely-recoverable error modes 

249 # (CUDA OOM, runtime allocation). Default to non-transient so 

250 # the TTS ladder doesn't waste cycles re-trying neutts on a 

251 # deterministic failure (bad weights, missing codec, etc.). 

252 msg = str(e).lower() 

253 transient = any(t in msg for t in ( 

254 'out of memory', 'cuda', 'device-side assert', 

255 )) 

256 return { 

257 'error': f"{type(e).__name__}: {e}", 

258 'engine': 'neutts_air', 

259 'transient': transient, 

260 } 

261 

262 # Write WAV via soundfile — required dep listed in pip_install_plan. 

263 try: 

264 import numpy as np 

265 import soundfile as sf 

266 except ImportError as e: 

267 return { 

268 'error': f"required dep missing: {e}", 

269 'engine': 'neutts_air', 

270 'transient': False, 

271 } 

272 

273 try: 

274 arr = np.asarray(wav) 

275 sf.write(output_path, arr, sample_rate) 

276 duration = len(arr) / sample_rate 

277 except Exception as e: 

278 return { 

279 'error': f"WAV write failed: {type(e).__name__}: {e}", 

280 'engine': 'neutts_air', 

281 'transient': False, 

282 } 

283 

284 return { 

285 'path': output_path, 

286 'duration': round(float(duration), 2), 

287 'sample_rate': sample_rate, 

288 'engine': 'neutts_air', 

289 'device': state.get('device', 'cpu'), 

290 'voice': voice, 

291 } 

292 

293 

294# ═══════════════════════════════════════════════════════════════ 

295# Parent-side: one ToolWorker instance + canonical public functions 

296# ═══════════════════════════════════════════════════════════════ 

297 

# Parent-side handle for the NeuTTS worker; the public functions below
# call ``_tool.synthesize`` / ``_tool.stop`` on it.
#
# Timeout rationale: NeuTTS Air on CPU produces RTF<0.5, so for a
# 10-word utterance the subprocess needs ~3-5s warm + ~1-2s synth.
# Loading the Q4 GGUF model is the slow part; once loaded, subsequent
# calls are quick. Budgets match the kokoro / chatterbox shape.
#
# NOTE(review): ``engine`` is spelled 'neutts-air' (hyphen) here while
# ``tool_name`` and every result payload in this module use 'neutts_air'
# (underscore). Confirm against ToolWorker whether that is intentional.
_tool = ToolWorker(
    tool_name='neutts_air',
    tool_module='integrations.service_tools.neutts_tool',
    vram_budget='tts_neutts',
    output_subdir='neutts/output',
    engine='neutts-air',
    startup_timeout=120.0,   # GGUF Q4 (~600MB) cold-start on CPU
    request_timeout=60.0,    # CPU synth dominated by RTF, generous
)

311 

312 

def neutts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize speech with NeuTTS Air by delegating to the worker.

    This is a thin parent-side wrapper: it normalises the arguments and
    hands them to ``_tool.synthesize``, returning whatever that returns.

    Args:
        text: Text to speak.
        language: Accepted for signature symmetry with multi-language
            engines but ignored; the request always goes out as ``'en'``
            because NeuTTS Air is English-only.
        voice: ``'jo'`` (the default when falsy), a path to a ``.wav``
            with a companion ``.txt`` transcript, or the name of a voice
            stored in the NeuTTS voices directory.
        output_path: Target ``.wav`` path, forwarded unchanged (``None``
            is passed through as-is).

    Returns:
        JSON string; see ``_synthesize`` for the payload shape.
    """
    request = {
        'text': text,
        'language': 'en',  # NeuTTS is English-only
        'voice': voice if voice else 'jo',
        'output_path': output_path,
    }
    return _tool.synthesize(**request)

347 

348 

def neutts_list_voices() -> str:
    """Return a JSON catalogue of the reference voices found on disk.

    Only the filesystem is inspected. A voice is listed when a ``.wav``
    has a ``.txt`` with the same stem next to it. Bundled samples from the
    installed ``neutts`` package come first (``type: "builtin"``),
    followed by the user's voices directory (``type: "cloned"``); each
    group is sorted by path.

    Returns:
        JSON string ``{"voices": [...], "engine": ...}`` where ``engine``
        is ``"neutts_air"`` if ``neutts`` is importable in this process
        and ``"none"`` otherwise. This probes the calling interpreter
        only; a worker running under a different environment may differ.
    """
    def _paired_voices(folder: Path, kind: str) -> list:
        # One entry per wav that has a same-stem transcript beside it.
        return [
            {
                "id": clip.stem,
                "name": clip.stem.title(),
                "type": kind,
                "language": "en",
            }
            for clip in sorted(folder.glob('*.wav'))
            if (folder / f"{clip.stem}.txt").is_file()
        ]

    catalogue = []

    # 1. Samples bundled with the neutts package; the same import also
    #    answers the engine-availability question.
    try:
        import neutts
    except ImportError:
        engine_name = "none"
    else:
        engine_name = "neutts_air"
        bundled_dir = Path(neutts.__file__).parent / 'samples'
        if bundled_dir.is_dir():
            catalogue.extend(_paired_voices(bundled_dir, "builtin"))

    # 2. User-cloned voices
    user_dir = _get_voices_dir()
    if user_dir.is_dir():
        catalogue.extend(_paired_voices(user_dir, "cloned"))

    return json.dumps({"voices": catalogue, "engine": engine_name})

397 

398 

def unload_neutts() -> None:
    """Stop the NeuTTS worker subprocess and free its memory.

    Delegates to ``_tool.stop()``; nothing is returned.
    NOTE(review): whether a later ``neutts_synthesize`` call restarts the
    worker depends on ToolWorker and is not visible here. Confirm.
    """
    _tool.stop()

402 

403 

404# ═══════════════════════════════════════════════════════════════ 

405# Service tool registration 

406# ═══════════════════════════════════════════════════════════════ 

407 

class NeuTTSAirTool:
    """Catalogue entry for NeuTTS Air in ``service_tool_registry``.

    The class only publishes metadata (name, description, endpoints,
    tags) under the ``inprocess://neutts_air`` base URL; it performs no
    synthesis itself.
    """

    @classmethod
    def register_functions(cls):
        """Build the ``ServiceToolInfo`` for NeuTTS Air and register it."""
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Convert text to speech audio. "
                "Input: text (string), voice (default 'jo' = "
                "upstream sample; also accepts path to .wav with "
                "companion .txt, or custom name from "
                "~/.hevolve/models/tts/neutts/voices/), "
                "output_path (optional). Returns JSON with audio "
                "file path and duration."
            ),
            "params_schema": {
                "text": {"type": "string", "description": "Text to speak"},
                "voice": {"type": "string", "description": "Voice name or path (default: jo)"},
                "output_path": {"type": "string", "description": "Output .wav path (optional)"},
            },
        }
        list_voices_endpoint = {
            "path": "/voices",
            "method": "GET",
            "description": "List available NeuTTS voices.",
            "params_schema": {},
        }
        summary = (
            "On-device English text-to-speech via NeuTTS Air "
            "(Neuphonic, Apache 2.0). 748M Qwen2-backbone, "
            "Q4 GGUF ~600MB, RTF<0.5 on CPU, 24kHz output. "
            "Instant voice cloning from 3-15s reference audio."
        )

        service_tool_registry.register(
            ServiceToolInfo(
                name="neutts_air",
                description=summary,
                base_url="inprocess://neutts_air",
                endpoints={
                    "synthesize": synthesize_endpoint,
                    "list_voices": list_voices_endpoint,
                },
                tags=["tts", "english", "voice_clone", "on_device"],
            )
        )

457 

458 

# Register the catalogue entry as a side effect of importing this module
# (same pattern as kokoro_tool / chatterbox_tool). Registration does not
# need the neutts package itself, so the entry stays available for the
# admin UI to offer "Install NeuTTS Air"; only synth subprocess calls
# fail (with clean JSON) while the package is absent. Any registration
# failure is deliberately non-fatal and logged at debug level only.
try:
    NeuTTSAirTool.register_functions()
except Exception as _reg_err:
    logger.debug("NeuTTS tool registration skipped: %s", _reg_err)

467 

468 

469# NOTE: no `if __name__ == '__main__':` block here. The centralized 

470# dispatcher at integrations.service_tools.gpu_worker imports this 

471# module and calls `_load` / `_synthesize` directly when spawned.