Coverage for integrations / service_tools / neutts_tool.py: 0.0%
121 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""TTS tool — text-to-speech via NeuTTS Air (Neuphonic).
3NeuTTS Air benefits:
4 - 748M params on Qwen2 backbone, GGUF Q4 (~600MB) / Q8 (~800MB)
5 - Apache 2.0, fully open source
6 - On-device: RTF<0.5 on CPU, 24kHz output
7 - Instant voice cloning from 3-15s reference audio
8 - English primary; CPU-friendly (no GPU required for Q4)
10Pip package: ``neutts`` on PyPI. Optional extras: ``neutts[all]`` pulls
11``llama-cpp-python`` (GGUF inference) + ``soundfile`` + ``onnxruntime``
12(codec decoder). License Apache-2.0.
14Architecture (ToolWorker pattern — same as kokoro / chatterbox / f5):
15 - This module exposes ``_load`` + ``_synthesize`` (subprocess-side
16 callbacks) and ``_tool`` (parent-side ``ToolWorker`` instance).
17 - ``neutts_synthesize`` (the canonical public entry point referenced
18 by ``tts_router.ENGINE_REGISTRY['neutts_air'].tool_function``)
19 forwards through ``_tool.synthesize``.
20 - The desktop installer (Nunba) routes ``neutts`` into a dedicated
21 venv (``install_target='venv'`` in tts_router.py) because
22 ``neutts[all]`` pulls ``llama-cpp-python`` whose torch / numpy
23 pins can drift from the main interpreter. ``ToolWorker``'s
24 ``python_exe`` is wired to the venv's python at install time so
25 the synth subprocess sees the pinned neutts deps.
27Reference voices (NeuTTS requires a reference audio + transcript for
28cloning — there is no zero-config default voice the way pocket_tts has
29built-in 'alba'/'jean'/etc.). Resolution order:
30 1. Path to a .wav (with companion .txt at the same stem) — ad-hoc
31 2. Custom name → ``~/.hevolve/models/tts/neutts/voices/<name>.wav``
32 (with companion .txt) — persistent user-cloned voices
33 3. ``'jo'`` (default) → upstream sample at
34 ``<site-packages>/neutts/samples/jo.{wav,txt}``
36Model downloaded lazily on first use (HuggingFace
37``neuphonic/neutts-air-q4-gguf`` backbone + ``neuphonic/neucodec``
38codec). Env overrides:
39 - ``NEUTTS_BACKBONE_REPO`` (default ``neuphonic/neutts-air-q4-gguf``)
40 - ``NEUTTS_BACKBONE_DEVICE`` (default ``cpu``)
41 - ``NEUTTS_CODEC_REPO`` (default ``neuphonic/neucodec``)
42 - ``NEUTTS_CODEC_DEVICE`` (default ``cpu``)
Public API (parent side):
    neutts_synthesize(text, language, voice, output_path) -> JSON
    neutts_list_voices() -> JSON
    unload_neutts() -> None
48"""
50import json
51import logging
52import os
53from pathlib import Path
54from typing import Optional, Tuple
56from integrations.service_tools.gpu_worker import ToolWorker
58from .registry import ServiceToolInfo, service_tool_registry
60logger = logging.getLogger(__name__)
63# ═══════════════════════════════════════════════════════════════
64# Storage paths (parent + subprocess use the same resolver — single
65# source of truth for "where do user-cloned voice files live?")
66# ═══════════════════════════════════════════════════════════════
def _get_tts_dir() -> Path:
    """Return the root directory for NeuTTS data, creating it on demand.

    The location is taken from the package's ``model_storage`` helper
    when it can be imported and queried. Any failure there falls back to
    ``~/.hevolve/models/tts/neutts``.
    """
    fallback = "~/.hevolve/models/tts/neutts"
    try:
        from .model_storage import model_storage
        root = model_storage.get_tool_dir("neutts")
    except Exception:  # ImportError included: the storage helper is optional
        root = Path(os.path.expanduser(fallback))
    root.mkdir(parents=True, exist_ok=True)
    return root
def _get_output_dir() -> Path:
    """Return the directory that receives synthesized audio files.

    It is the ``output`` folder beneath the NeuTTS storage root and is
    created if it does not exist yet.
    """
    target = _get_tts_dir().joinpath("output")
    target.mkdir(parents=True, exist_ok=True)
    return target
def _get_voices_dir() -> Path:
    """Return the directory holding user-cloned reference voices.

    It is the ``voices`` folder beneath the NeuTTS storage root and is
    created if it does not exist yet.
    """
    target = _get_tts_dir().joinpath("voices")
    target.mkdir(parents=True, exist_ok=True)
    return target
def _resolve_reference(voice: str) -> Tuple[Optional[str], Optional[str]]:
    """Map a ``voice`` argument onto ``(ref_audio_path, ref_text)``.

    Lookup order:
      1. ``voice`` is an existing file: use it as the reference audio and
         read the transcript from the ``.txt`` sharing its stem.
      2. ``voice`` names a ``<voice>.wav`` + ``<voice>.txt`` pair inside
         the user voices directory.
      3. ``voice == 'jo'``: the sample found under ``samples/`` in the
         installed ``neutts`` package.

    Returns:
        ``(audio_path, transcript)`` on success, otherwise
        ``(None, None)``, which the caller must treat as a failure.
    """
    def _read_transcript(path) -> str:
        # Transcripts are stored as UTF-8; surrounding whitespace is noise.
        with open(path, encoding='utf-8') as handle:
            return handle.read().strip()

    # 1. Direct path. An existing file without a transcript is rejected
    #    outright instead of falling through to the name-based lookups.
    if voice and os.path.isfile(voice):
        transcript_path = os.path.splitext(voice)[0] + '.txt'
        if not os.path.isfile(transcript_path):
            return None, None
        return voice, _read_transcript(transcript_path)

    # 2. Persistent user-cloned voice: both files of the pair must exist.
    voices_dir = _get_voices_dir()
    cloned_audio = voices_dir / f"{voice}.wav"
    cloned_text = voices_dir / f"{voice}.txt"
    if cloned_audio.is_file() and cloned_text.is_file():
        return str(cloned_audio), _read_transcript(cloned_text)

    # 3. The only built-in name is 'jo'; everything else is unknown.
    if voice != 'jo':
        return None, None
    try:
        import neutts  # noqa: F401
    except ImportError:
        return None, None
    samples = Path(neutts.__file__).parent / 'samples'
    bundled_audio = samples / 'jo.wav'
    bundled_text = samples / 'jo.txt'
    if bundled_audio.is_file() and bundled_text.is_file():
        return str(bundled_audio), _read_transcript(bundled_text)
    return None, None
135# ═══════════════════════════════════════════════════════════════
136# Subprocess-side callbacks (invoked by gpu_worker dispatcher)
137# ═══════════════════════════════════════════════════════════════
def _load() -> dict:
    """Construct the NeuTTS Air model; meant to run once per worker.

    Four environment variables select repositories and devices:
    ``NEUTTS_BACKBONE_REPO`` (default ``neuphonic/neutts-air-q4-gguf``),
    ``NEUTTS_BACKBONE_DEVICE`` (default ``cpu``), ``NEUTTS_CODEC_REPO``
    (default ``neuphonic/neucodec``) and ``NEUTTS_CODEC_DEVICE``
    (default ``cpu``). CPU is the default because, per the upstream
    README, the Q4 GGUF build already runs faster than real time there;
    set ``NEUTTS_BACKBONE_DEVICE=cuda`` to use a GPU instead.

    Returns:
        A state dict with ``model`` (the ``NeuTTS`` instance), ``device``
        (the backbone device string) and ``ref_cache`` (an initially
        empty per-voice cache of encoded references, so repeated
        requests for one voice do not repeat the codec encode).

    Raises:
        ImportError: if the ``neutts`` package cannot be imported. The
            message includes the install hint and the original error.
    """
    try:
        from neutts import NeuTTS  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "neutts package not installed. "
            f"Install with: pip install neutts[all] ({exc})"
        ) from exc

    env = os.environ
    settings = {
        'backbone_repo': env.get(
            'NEUTTS_BACKBONE_REPO', 'neuphonic/neutts-air-q4-gguf'),
        'backbone_device': env.get('NEUTTS_BACKBONE_DEVICE', 'cpu'),
        'codec_repo': env.get('NEUTTS_CODEC_REPO', 'neuphonic/neucodec'),
        'codec_device': env.get('NEUTTS_CODEC_DEVICE', 'cpu'),
    }

    logger.info(
        "Loading NeuTTS Air (backbone=%s on %s, codec=%s on %s)...",
        settings['backbone_repo'],
        settings['backbone_device'],
        settings['codec_repo'],
        settings['codec_device'],
    )
    engine = NeuTTS(**settings)
    logger.info("NeuTTS Air ready")

    return {
        'model': engine,
        'device': settings['backbone_device'],
        # Filled lazily by the synth callback, keyed by voice.
        'ref_cache': {},
    }
def _neutts_error(message: str, transient: bool = False) -> dict:
    """Build the uniform ``{error, engine, transient}`` failure payload."""
    return {'error': message, 'engine': 'neutts_air', 'transient': transient}


def _synthesize(state, req: dict) -> dict:
    """Run one synthesis request inside the worker.

    Args:
        state: dict returned by ``_load``. ``state['model']`` is the
            loaded model, ``state['ref_cache']`` maps a voice to its
            ``(ref_codes, ref_text)`` pair and ``state['device']`` is
            echoed back in the result.
        req: ``{text, voice, output_path, sample_rate?}`` request.
            ``voice`` defaults to ``'jo'``. ``sample_rate`` defaults to
            24000 and is the rate written into the output file.

    Returns:
        ``{path, duration, sample_rate, engine, device, voice}`` on
        success, or ``{error, engine, transient}`` on failure. Every
        failure payload carries ``transient``. It is ``True`` only when
        an inference error message mentions ``out of memory``, ``cuda``
        or ``device-side assert``. Request validation, unresolved
        voices, reference-encode failures, missing dependencies and
        write failures are all deterministic and report ``False``.
    """
    # Validate the request up front so malformed input yields an error
    # payload instead of an exception, and before any model work is done.
    text = req.get('text', '')
    if not isinstance(text, str) or not text.strip():
        return _neutts_error('Text is required')

    output_path = req.get('output_path')
    if not output_path:
        return _neutts_error('output_path is required')

    raw_rate = req.get('sample_rate') or 24000
    try:
        sample_rate = int(raw_rate)
    except (TypeError, ValueError):
        sample_rate = 0
    if sample_rate <= 0:
        return _neutts_error(f"Invalid sample_rate: {raw_rate!r}")

    voice = req.get('voice') or 'jo'

    # Resolve + cache the reference codes (codec encoding is the slow
    # part; one-time cost per voice per subprocess).
    cache = state.setdefault('ref_cache', {})
    cached = cache.get(voice)
    if cached is None:
        ref_wav, ref_text = _resolve_reference(voice)
        if not ref_wav or not ref_text:
            return _neutts_error(
                f"NeuTTS voice {voice!r} not configured (no reference "
                f"audio + transcript found). Provide a .wav with "
                f"companion .txt at the same stem, or use the "
                f"upstream 'jo' sample after installing the neutts "
                f"package."
            )
        try:
            ref_codes = state['model'].encode_reference(ref_wav)
        except Exception as e:
            return _neutts_error(
                f"Reference encode failed: {type(e).__name__}: {e}")
        cached = (ref_codes, ref_text)
        cache[voice] = cached
    ref_codes, ref_text = cached

    try:
        wav = state['model'].infer(text, ref_codes, ref_text)
    except Exception as e:
        # Surface as transient ONLY for likely-recoverable error modes
        # (CUDA OOM, runtime allocation). Default to non-transient so
        # the TTS ladder doesn't waste cycles re-trying neutts on a
        # deterministic failure (bad weights, missing codec, etc.).
        msg = str(e).lower()
        transient = any(marker in msg for marker in (
            'out of memory', 'cuda', 'device-side assert',
        ))
        return _neutts_error(f"{type(e).__name__}: {e}", transient)

    # Imported lazily so a missing audio dependency becomes a clean
    # error payload rather than an import-time crash of the module.
    try:
        import numpy as np
        import soundfile as sf
    except ImportError as e:
        return _neutts_error(f"required dep missing: {e}")

    try:
        arr = np.asarray(wav)
        sf.write(output_path, arr, sample_rate)
        duration = len(arr) / sample_rate
    except Exception as e:
        return _neutts_error(f"WAV write failed: {type(e).__name__}: {e}")

    return {
        'path': output_path,
        'duration': round(float(duration), 2),
        'sample_rate': sample_rate,
        'engine': 'neutts_air',
        'device': state.get('device', 'cpu'),
        'voice': voice,
    }
294# ═══════════════════════════════════════════════════════════════
295# Parent-side: one ToolWorker instance + canonical public functions
296# ═══════════════════════════════════════════════════════════════
# The single parent-side worker handle for NeuTTS Air. Timeout budgets
# follow the same shape as the kokoro / chatterbox tools: loading the
# Q4 GGUF model is the slow step (hence the long startup budget), and
# once it is loaded individual synth calls are comparatively quick.
# NOTE(review): ``engine`` is spelled 'neutts-air' while ``tool_name``
# and the result payloads use 'neutts_air' — confirm this is intended.
_tool = ToolWorker(
    tool_module='integrations.service_tools.neutts_tool',
    tool_name='neutts_air',
    engine='neutts-air',
    vram_budget='tts_neutts',
    output_subdir='neutts/output',
    request_timeout=60.0,    # CPU synth time scales with RTF; generous
    startup_timeout=120.0,   # cold start of the ~600MB Q4 GGUF on CPU
)
def neutts_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize ``text`` with NeuTTS Air via the worker subprocess.

    This is a thin parent-side wrapper: the request is handed to
    ``_tool.synthesize`` and its result is returned unchanged.

    Args:
        text: Text to synthesize.
        language: Accepted so the signature lines up with multi-language
            engines, but never forwarded; the worker is always asked
            for ``'en'`` because NeuTTS Air is English-only.
        voice: ``'jo'`` (the default when empty or ``None``), a path to
            a ``.wav`` with a companion ``.txt`` transcript, or the name
            of a voice stored in the user voices directory.
        output_path: Destination ``.wav`` path, or ``None`` to leave the
            choice to the worker layer.

    Returns:
        JSON string. On failure it contains ``error`` and ``transient``
        so callers can move on to another engine.
    """
    request = {
        'text': text,
        'language': 'en',  # fixed: the model has no language switch
        'voice': voice if voice else 'jo',
        'output_path': output_path,
    }
    return _tool.synthesize(**request)
def neutts_list_voices() -> str:
    """Return a JSON listing of the reference voices found on disk.

    Two locations are scanned, in this order: the ``samples`` folder of
    the installed ``neutts`` package (reported as ``"builtin"``) and the
    user voices directory (reported as ``"cloned"``). A ``.wav`` only
    counts when a ``.txt`` with the same stem sits next to it. No
    subprocess is involved; this only reads the filesystem.

    Returns:
        JSON string ``{"voices": [...], "engine": ...}`` where
        ``engine`` is ``"neutts_air"`` if ``neutts`` is importable in
        this process and ``"none"`` otherwise.
    """
    def _paired_voices(folder, kind):
        # One entry per wav that has a transcript beside it, name-sorted.
        return [
            {
                "id": wav.stem,
                "name": wav.stem.title(),
                "type": kind,
                "language": "en",
            }
            for wav in sorted(folder.glob('*.wav'))
            if (folder / f"{wav.stem}.txt").is_file()
        ]

    # A single import probe serves both the sample scan and the engine
    # flag. It reflects this (parent) interpreter only.
    try:
        import neutts  # noqa: F401
    except ImportError:
        neutts = None

    found = []
    if neutts is not None:
        bundled = Path(neutts.__file__).parent / 'samples'
        if bundled.is_dir():
            found.extend(_paired_voices(bundled, "builtin"))

    cloned = _get_voices_dir()
    if cloned.is_dir():
        found.extend(_paired_voices(cloned, "cloned"))

    engine_name = "none" if neutts is None else "neutts_air"
    return json.dumps({"voices": found, "engine": engine_name})
def unload_neutts() -> None:
    """Shut down the NeuTTS worker by delegating to ``_tool.stop()``.

    Stopping the worker subprocess is what releases the loaded model.
    """
    _tool.stop()
    return None
404# ═══════════════════════════════════════════════════════════════
405# Service tool registration
406# ═══════════════════════════════════════════════════════════════
class NeuTTSAirTool:
    """Catalog registration for the NeuTTS Air engine.

    The class only adds an entry to ``service_tool_registry``; synthesis
    itself is done by the module-level functions backed by the worker
    subprocess.
    """

    @classmethod
    def register_functions(cls):
        """Build the NeuTTS ``ServiceToolInfo`` and register it."""
        def _string_param(description):
            # Every documented parameter is a plain string field.
            return {"type": "string", "description": description}

        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Convert text to speech audio. "
                "Input: text (string), voice (default 'jo' = "
                "upstream sample; also accepts path to .wav with "
                "companion .txt, or custom name from "
                "~/.hevolve/models/tts/neutts/voices/), "
                "output_path (optional). Returns JSON with audio "
                "file path and duration."
            ),
            "params_schema": {
                "text": _string_param("Text to speak"),
                "voice": _string_param("Voice name or path (default: jo)"),
                "output_path": _string_param("Output .wav path (optional)"),
            },
        }
        list_voices_endpoint = {
            "path": "/voices",
            "method": "GET",
            "description": "List available NeuTTS voices.",
            "params_schema": {},
        }

        service_tool_registry.register(ServiceToolInfo(
            name="neutts_air",
            description=(
                "On-device English text-to-speech via NeuTTS Air "
                "(Neuphonic, Apache 2.0). 748M Qwen2-backbone, "
                "Q4 GGUF ~600MB, RTF<0.5 on CPU, 24kHz output. "
                "Instant voice cloning from 3-15s reference audio."
            ),
            base_url="inprocess://neutts_air",
            endpoints={
                "synthesize": synthesize_endpoint,
                "list_voices": list_voices_endpoint,
            },
            tags=["tts", "english", "voice_clone", "on_device"],
        ))
# Register at import time, as the kokoro / chatterbox tool modules do.
# Registration does not need the neutts package itself: when it is
# absent only the synth subprocess calls fail (with clean JSON), and
# the catalog entry remains so the admin UI can offer an install.
# Any registration error is deliberately non-fatal for the import.
try:
    NeuTTSAirTool.register_functions()
except Exception as registration_error:
    logger.debug(
        "NeuTTS tool registration skipped: %s", registration_error)
469# NOTE: no `if __name__ == '__main__':` block here. The centralized
470# dispatcher at integrations.service_tools.gpu_worker imports this
471# module and calls `_load` / `_synthesize` directly when spawned.