Coverage for integrations/service_tools/neutts

1"""TTS tool — text-to-speech via NeuTTS Air (Neuphonic).

3NeuTTS Air benefits:

4 - 748M params on Qwen2 backbone, GGUF Q4 (~600MB) / Q8 (~800MB)

5 - Apache 2.0, fully open source

6 - On-device: RTF<0.5 on CPU, 24kHz output

7 - Instant voice cloning from 3-15s reference audio

8 - English primary; CPU-friendly (no GPU required for Q4)

10Pip package: ``neutts`` on PyPI. Optional extras: ``neutts[all]`` pulls

11``llama-cpp-python`` (GGUF inference) + ``soundfile`` + ``onnxruntime``

12(codec decoder). License Apache-2.0.

14Architecture (ToolWorker pattern — same as kokoro / chatterbox / f5):

15 - This module exposes ``_load`` + ``_synthesize`` (subprocess-side

16 callbacks) and ``_tool`` (parent-side ``ToolWorker`` instance).

17 - ``neutts_synthesize`` (the canonical public entry point referenced

18 by ``tts_router.ENGINE_REGISTRY['neutts_air'].tool_function``)

19 forwards through ``_tool.synthesize``.

20 - The desktop installer (Nunba) routes ``neutts`` into a dedicated

21 venv (``install_target='venv'`` in tts_router.py) because

22 ``neutts[all]`` pulls ``llama-cpp-python`` whose torch / numpy

23 pins can drift from the main interpreter. ``ToolWorker``'s

24 ``python_exe`` is wired to the venv's python at install time so

25 the synth subprocess sees the pinned neutts deps.

27Reference voices (NeuTTS requires a reference audio + transcript for

28cloning — there is no zero-config default voice the way pocket_tts has

29built-in 'alba'/'jean'/etc.). Resolution order:

30 1. Path to a .wav (with companion .txt at the same stem) — ad-hoc

31 2. Custom name → ``~/.hevolve/models/tts/neutts/voices/<name>.wav``

32 (with companion .txt) — persistent user-cloned voices

33 3. ``'jo'`` (default) → upstream sample at

34 ``<site-packages>/neutts/samples/jo.{wav,txt}``

36Model downloaded lazily on first use (HuggingFace

37``neuphonic/neutts-air-q4-gguf`` backbone + ``neuphonic/neucodec``

38codec). Env overrides:

39 - ``NEUTTS_BACKBONE_REPO`` (default ``neuphonic/neutts-air-q4-gguf``)

40 - ``NEUTTS_BACKBONE_DEVICE`` (default ``cpu``)

41 - ``NEUTTS_CODEC_REPO`` (default ``neuphonic/neucodec``)

42 - ``NEUTTS_CODEC_DEVICE`` (default ``cpu``)

44Public API (parent side):

45 neutts_synthesize(text, language, voice, output_path) -> JSON

46 neutts_list_voices() -> JSON

47 unload_neutts() -> None

48"""

50import json

51import logging

52import os

53from pathlib import Path

54from typing import Optional, Tuple

56from integrations.service_tools.gpu_worker import ToolWorker

58from .registry import ServiceToolInfo, service_tool_registry

60logger = logging.getLogger(__name__)

63# ═══════════════════════════════════════════════════════════════

64# Storage paths (parent + subprocess use the same resolver — single

65# source of truth for "where do user-cloned voice files live?")

66# ═══════════════════════════════════════════════════════════════

def _get_tts_dir() -> Path:
    """Return the root NeuTTS storage directory, creating it if absent.

    The location reported by the project's ``model_storage`` helper is
    preferred. If that helper cannot be imported, or the lookup fails
    for any other reason, ``~/.hevolve/models/tts/neutts`` is used.
    """
    fallback = Path(os.path.expanduser("~/.hevolve/models/tts/neutts"))
    try:
        from .model_storage import model_storage
        root = model_storage.get_tool_dir("neutts")
    except Exception:  # covers ImportError too: any failure means "use fallback"
        root = fallback
    root.mkdir(parents=True, exist_ok=True)
    return root

def _get_output_dir() -> Path:
    """Return the ``output`` folder under the NeuTTS storage root.

    The folder is created on demand, so the returned path always exists.
    """
    target = _get_tts_dir().joinpath("output")
    target.mkdir(parents=True, exist_ok=True)
    return target

def _get_voices_dir() -> Path:
    """Return the ``voices`` folder holding user-cloned voice references.

    The folder lives under the NeuTTS storage root and is created on
    demand, so the returned path always exists.
    """
    target = _get_tts_dir().joinpath("voices")
    target.mkdir(parents=True, exist_ok=True)
    return target

def _read_transcript(txt_path) -> Optional[str]:
    """Return the stripped UTF-8 transcript at ``txt_path``, or None if unreadable."""
    try:
        with open(txt_path, encoding='utf-8') as fp:
            return fp.read().strip()
    except (OSError, UnicodeError):
        return None


def _resolve_reference(voice: str) -> Tuple[Optional[str], Optional[str]]:
    """Resolve a `voice` argument to (ref_audio_path, ref_text).

    Resolution order:
      1. Path to an existing file (companion .txt at same stem supplies
         the transcript)
      2. Custom voice name → ``<voices dir>/{name}.wav`` + ``{name}.txt``
      3. Built-in name 'jo' → sample shipped inside the neutts package
      4. Anything else → (None, None) — caller MUST treat as failure

    Args:
        voice: File path or voice name. A falsy value resolves to
            (None, None) without touching the filesystem.

    Returns:
        ``(ref_audio_path, ref_text)`` on success, ``(None, None)``
        otherwise. A transcript that exists but cannot be read or
        decoded as UTF-8 is also reported as ``(None, None)`` rather
        than raising, so callers only have one failure shape to handle.
        An empty transcript is returned as ``''`` (callers decide).
    """
    if not voice:
        return None, None

    # 1. Direct path
    if os.path.isfile(voice):
        txt_path = os.path.splitext(voice)[0] + '.txt'
        if not os.path.isfile(txt_path):
            # No transcript — refuse rather than guess
            return None, None
        ref_text = _read_transcript(txt_path)
        if ref_text is None:
            return None, None
        return voice, ref_text

    # 2. Custom user-cloned voice in our voices dir (resolved once)
    voices_dir = _get_voices_dir()
    custom_wav = voices_dir / f"{voice}.wav"
    custom_txt = voices_dir / f"{voice}.txt"
    if custom_wav.is_file() and custom_txt.is_file():
        ref_text = _read_transcript(custom_txt)
        if ref_text is None:
            # Do not fall through to the bundled sample: silently
            # swapping in a different voice would be worse than failing.
            return None, None
        return str(custom_wav), ref_text

    # 3. Upstream sample 'jo' shipped with the neutts package
    if voice == 'jo':
        try:
            import neutts
        except ImportError:
            return None, None
        pkg_file = getattr(neutts, '__file__', None)
        if pkg_file:
            samples = Path(pkg_file).parent / 'samples'
            sample_wav = samples / 'jo.wav'
            sample_txt = samples / 'jo.txt'
            if sample_wav.is_file() and sample_txt.is_file():
                ref_text = _read_transcript(sample_txt)
                if ref_text is not None:
                    return str(sample_wav), ref_text

    return None, None

133

134

135# ═══════════════════════════════════════════════════════════════

136# Subprocess-side callbacks (invoked by gpu_worker dispatcher)

137# ═══════════════════════════════════════════════════════════════

138

def _load() -> dict:
    """Construct the NeuTTS Air model and return the worker state dict.

    Repos and devices are read from the environment, each with a
    default:

      - ``NEUTTS_BACKBONE_REPO``   → ``neuphonic/neutts-air-q4-gguf``
      - ``NEUTTS_BACKBONE_DEVICE`` → ``cpu``
      - ``NEUTTS_CODEC_REPO``      → ``neuphonic/neucodec``
      - ``NEUTTS_CODEC_DEVICE``    → ``cpu``

    Returns:
        ``{'model', 'device', 'ref_cache'}`` — the model instance, the
        backbone device string, and an initially empty per-voice cache
        that ``_synthesize`` fills with encoded reference codes.

    Raises:
        ImportError: if the ``neutts`` package cannot be imported. The
            original error is chained and echoed in the message.
    """
    try:
        from neutts import NeuTTS  # type: ignore
    except ImportError as exc:
        message = (
            "neutts package not installed. "
            f"Install with: pip install neutts[all] ({exc})"
        )
        raise ImportError(message) from exc

    env = os.environ
    settings = {
        'backbone_repo': env.get(
            'NEUTTS_BACKBONE_REPO', 'neuphonic/neutts-air-q4-gguf'),
        'backbone_device': env.get('NEUTTS_BACKBONE_DEVICE', 'cpu'),
        'codec_repo': env.get('NEUTTS_CODEC_REPO', 'neuphonic/neucodec'),
        'codec_device': env.get('NEUTTS_CODEC_DEVICE', 'cpu'),
    }

    logger.info(
        "Loading NeuTTS Air (backbone=%s on %s, codec=%s on %s)...",
        settings['backbone_repo'], settings['backbone_device'],
        settings['codec_repo'], settings['codec_device'],
    )
    engine = NeuTTS(**settings)
    logger.info("NeuTTS Air ready")

    # ref_cache starts empty; _synthesize stores one (codes, transcript)
    # pair per voice so the reference encode runs once per voice.
    return {
        'model': engine,
        'device': settings['backbone_device'],
        'ref_cache': {},
    }

187

188

def _synthesize(state, req: dict) -> dict:
    """Run one synthesis request inside the worker.

    Args:
        state: dict returned by ``_load`` — read for ``'model'`` and
            ``'device'``, and used to hold the per-subprocess
            ``'ref_cache'`` of encoded references keyed by voice.
        req: ``{text, voice, output_path, sample_rate?}`` request.
            ``voice`` defaults to ``'jo'``; ``sample_rate`` defaults to
            24000 and must convert to a positive integer.

    Returns:
        ``{path, duration, sample_rate, engine, device, voice}`` on
        success or ``{error, engine, transient}`` on failure. Every
        failure dict carries ``transient``; it is ``True`` only when an
        inference error message mentions ``out of memory``, ``cuda`` or
        ``device-side assert``. Validation, voice-resolution, encode,
        missing-dependency and write failures are all ``False`` because
        retrying the same input would fail the same way.
    """
    def _fail(message: str, transient: bool = False) -> dict:
        # Single place that builds the failure shape so no branch can
        # forget the 'transient' key.
        return {
            'error': message,
            'engine': 'neutts_air',
            'transient': transient,
        }

    text = req.get('text', '')
    if not isinstance(text, str) or not text.strip():
        return _fail('Text is required')

    output_path = req.get('output_path')
    if not output_path:
        return _fail('output_path is required')

    voice = req.get('voice') or 'jo'

    # NOTE(review): sample_rate is only passed to the WAV writer below;
    # no resampling is visible in this function — confirm callers only
    # pass the model's native rate.
    raw_rate = req.get('sample_rate') or 24000
    try:
        sample_rate = int(raw_rate)
    except (TypeError, ValueError):
        return _fail(f"Invalid sample_rate: {raw_rate!r}")
    if sample_rate <= 0:
        return _fail(f"Invalid sample_rate: {raw_rate!r}")

    # Resolve + cache the reference codes (codec encoding is the slow
    # part; one-time cost per voice per subprocess).
    cache = state.setdefault('ref_cache', {})
    cached = cache.get(voice)
    if cached is None:
        try:
            ref_wav, ref_text = _resolve_reference(voice)
        except (OSError, UnicodeError) as e:
            return _fail(f"Reference lookup failed: {type(e).__name__}: {e}")
        if not ref_wav or not ref_text:
            return _fail(
                f"NeuTTS voice {voice!r} not configured (no reference "
                f"audio + transcript found). Provide a .wav with "
                f"companion .txt at the same stem, or use the "
                f"upstream 'jo' sample after installing the neutts "
                f"package."
            )
        try:
            ref_codes = state['model'].encode_reference(ref_wav)
        except Exception as e:
            return _fail(f"Reference encode failed: {type(e).__name__}: {e}")
        cached = (ref_codes, ref_text)
        cache[voice] = cached
    ref_codes, ref_text = cached

    try:
        wav = state['model'].infer(text, ref_codes, ref_text)
    except Exception as e:
        # Surface as transient ONLY for likely-recoverable error modes
        # (CUDA OOM, runtime allocation). Default to non-transient so
        # the TTS ladder doesn't waste cycles re-trying neutts on a
        # deterministic failure (bad weights, missing codec, etc.).
        msg = str(e).lower()
        transient = any(
            marker in msg
            for marker in ('out of memory', 'cuda', 'device-side assert')
        )
        return _fail(f"{type(e).__name__}: {e}", transient)

    # Imported lazily so a missing dependency becomes error JSON rather
    # than an import failure of the whole module.
    try:
        import numpy as np
        import soundfile as sf
    except ImportError as e:
        return _fail(f"required dep missing: {e}")

    try:
        arr = np.asarray(wav)
        sf.write(output_path, arr, sample_rate)
        duration = len(arr) / sample_rate
    except Exception as e:
        return _fail(f"WAV write failed: {type(e).__name__}: {e}")

    return {
        'path': output_path,
        'duration': round(float(duration), 2),
        'sample_rate': sample_rate,
        'engine': 'neutts_air',
        'device': state.get('device', 'cpu'),
        'voice': voice,
    }

292

293

294# ═══════════════════════════════════════════════════════════════

295# Parent-side: one ToolWorker instance + canonical public functions

296# ═══════════════════════════════════════════════════════════════

297

# NeuTTS Air on CPU produces RTF<0.5 → for a 10-word utterance, the
# subprocess needs ~3-5s warm + ~1-2s synth. Q4 GGUF model load is
# the slow part; once loaded subsequent calls are quick. Match the
# kokoro / chatterbox shape for budgets.
#
# Module-level worker handle: `neutts_synthesize` forwards to
# `_tool.synthesize` and `unload_neutts` calls `_tool.stop`.
_tool = ToolWorker(
    tool_name='neutts_air',
    tool_module='integrations.service_tools.neutts_tool',
    vram_budget='tts_neutts',
    output_subdir='neutts/output',
    # NOTE(review): hyphenated here, while the result dicts built in
    # this module use 'neutts_air' — confirm ToolWorker expects this
    # spelling.
    engine='neutts-air',
    startup_timeout=120.0,  # GGUF Q4 (~600MB) cold-start on CPU
    request_timeout=60.0,   # CPU synth dominated by RTF, generous
)

311

312

def neutts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize speech with NeuTTS Air by delegating to the worker.

    This is a thin parent-side wrapper: it forwards the request to
    ``_tool.synthesize`` and returns whatever that call returns.

    Args:
        text: Text to synthesize.
        language: Accepted for signature symmetry with multi-language
            engines but never forwarded — ``'en'`` is always sent.
        voice: Voice name or path to a reference .wav (with companion
            .txt transcript). A falsy value selects ``'jo'``.
        output_path: Destination .wav path, passed through unchanged
            (``None`` is forwarded as-is).

    Returns:
        The result of ``_tool.synthesize`` (annotated as a JSON string).
    """
    chosen_voice = voice if voice else 'jo'
    # NeuTTS is English-only, so the caller's ``language`` is ignored.
    return _tool.synthesize(
        text=text,
        language='en',
        voice=chosen_voice,
        output_path=output_path,
    )

347

348

def neutts_list_voices() -> str:
    """Return a JSON listing of the NeuTTS reference voices found on disk.

    Two locations are scanned, filesystem only:

      1. ``samples/`` inside the installed ``neutts`` package
         (entries typed ``"builtin"``); skipped when the package is
         not importable in this process.
      2. The user voices directory (entries typed ``"cloned"``).

    A ``.wav`` is listed only when a ``.txt`` with the same stem sits
    next to it. Builtin entries come first; each group is sorted by
    path.

    Returns:
        JSON string ``{"voices": [...], "engine": ...}`` where
        ``engine`` is ``"neutts_air"`` if ``neutts`` imports here and
        ``"none"`` otherwise.
    """
    def _collect(folder: Path, kind: str) -> list:
        # One entry per .wav that has a companion transcript.
        if not folder.is_dir():
            return []
        return [
            {
                "id": wav.stem,
                "name": wav.stem.title(),
                "type": kind,
                "language": "en",
            }
            for wav in sorted(folder.glob('*.wav'))
            if folder.joinpath(f"{wav.stem}.txt").is_file()
        ]

    # The import doubles as the availability probe. It only reflects
    # the parent interpreter; a venv-routed install may not show here.
    try:
        import neutts
    except ImportError:
        builtin = []
        engine = "none"
    else:
        builtin = _collect(Path(neutts.__file__).parent / 'samples', "builtin")
        engine = "neutts_air"

    cloned = _collect(_get_voices_dir(), "cloned")

    return json.dumps({"voices": builtin + cloned, "engine": engine})

397

398

def unload_neutts() -> None:
    """Stop the NeuTTS worker subprocess and free its memory.

    Delegates to ``_tool.stop()``; any value that call returns is
    discarded.
    """
    _tool.stop()

402

403

404# ═══════════════════════════════════════════════════════════════

405# Service tool registration

406# ═══════════════════════════════════════════════════════════════

407

class NeuTTSAirTool:
    """Catalog registration for the NeuTTS Air engine.

    Holds no state. Its single classmethod builds the
    ``ServiceToolInfo`` describing the engine's two endpoints and hands
    it to ``service_tool_registry``.
    """

    @classmethod
    def register_functions(cls):
        """Build the NeuTTS Air descriptor and register it."""
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Convert text to speech audio. "
                "Input: text (string), voice (default 'jo' = "
                "upstream sample; also accepts path to .wav with "
                "companion .txt, or custom name from "
                "~/.hevolve/models/tts/neutts/voices/), "
                "output_path (optional). Returns JSON with audio "
                "file path and duration."
            ),
            "params_schema": {
                "text": {
                    "type": "string",
                    "description": "Text to speak",
                },
                "voice": {
                    "type": "string",
                    "description": "Voice name or path (default: jo)",
                },
                "output_path": {
                    "type": "string",
                    "description": "Output .wav path (optional)",
                },
            },
        }
        list_voices_endpoint = {
            "path": "/voices",
            "method": "GET",
            "description": "List available NeuTTS voices.",
            "params_schema": {},
        }
        summary = (
            "On-device English text-to-speech via NeuTTS Air "
            "(Neuphonic, Apache 2.0). 748M Qwen2-backbone, "
            "Q4 GGUF ~600MB, RTF<0.5 on CPU, 24kHz output. "
            "Instant voice cloning from 3-15s reference audio."
        )

        descriptor = ServiceToolInfo(
            name="neutts_air",
            description=summary,
            base_url="inprocess://neutts_air",
            endpoints={
                "synthesize": synthesize_endpoint,
                "list_voices": list_voices_endpoint,
            },
            tags=["tts", "english", "voice_clone", "on_device"],
        )
        service_tool_registry.register(descriptor)

457

458

# Auto-register on import (matches kokoro_tool / chatterbox_tool
# pattern). The registration is robust to neutts package absence —
# only synth subprocess calls fail with clean JSON; the catalog entry
# stays so the admin UI can offer "Install NeuTTS Air".
#
# Registration is best-effort: a failure is logged at debug level and
# must never prevent this module from importing.
try:
    NeuTTSAirTool.register_functions()
except Exception as _reg_err:
    # Lazy %-formatting: the message is only built if debug is enabled.
    logger.debug("NeuTTS tool registration skipped: %s", _reg_err)

467

468

469# NOTE: no `if __name__ == '__main__':` block here. The centralized

470# dispatcher at integrations.service_tools.gpu_worker imports this

471# module and calls `_load` / `_synthesize` directly when spawned.

Coverage for integrations / service_tools / neutts_tool.py: 0.0%

121 statements