Coverage for integrations / service_tools / pocket_tts_tool.py: 70.6%
136 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2TTS tool — in-process text-to-speech via Pocket TTS (Kyutai).
4Pocket TTS benefits:
5 - 100M params — runs at 6x real-time on CPU (no GPU required)
6 - MIT license, fully open source
7 - Zero-shot voice cloning from 5 seconds of audio
8 - ~200ms latency for first audio chunk
9 - English (more languages planned by upstream)
10 - 100% local, zero cloud costs
12Model downloaded lazily on first use to ~/.hevolve/models/tts/
14Fallback: espeak-ng (if pocket-tts not installed).
16Public API:
17 pocket_tts_synthesize(text, voice, output_path) → JSON
18 pocket_tts_list_voices() → JSON
19 pocket_tts_clone_voice(audio_path, name) → JSON
20 unload_pocket_tts() → None
21"""
23import json
24import logging
25import os
26from pathlib import Path
27from typing import Optional
29from .registry import ServiceToolInfo, service_tool_registry
31logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════
# Built-in voices (shipped with pocket-tts)
# ═══════════════════════════════════════════════════════════════

# Voice ids reported as type "builtin" by pocket_tts_list_voices().
_BUILTIN_VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma",
]

# ═══════════════════════════════════════════════════════════════
# Cached model (avoid reloading on every call)
# ═══════════════════════════════════════════════════════════════

# Set by _load_model() on first use; reset to None by unload_pocket_tts().
_tts_model = None

# voice_name -> voice_state cache, filled by _get_voice_state() and
# pocket_tts_clone_voice(), emptied by unload_pocket_tts().
_voice_states = {}
50# ═══════════════════════════════════════════════════════════════
51# Model management
52# ═══════════════════════════════════════════════════════════════
def _get_tts_dir() -> Path:
    """Return the TTS model/output storage directory, creating it if needed.

    The location is ``model_storage.get_tool_dir("tts")`` when that helper can
    be imported and called; if anything goes wrong there, the path falls back
    to ``~/.hevolve/models/tts``.
    """
    try:
        from .model_storage import model_storage

        location = model_storage.get_tool_dir("tts")
    except Exception:  # covers ImportError too: any failure uses the fallback
        location = Path(os.path.expanduser("~/.hevolve/models/tts"))

    location.mkdir(parents=True, exist_ok=True)
    return location
def _get_output_dir() -> Path:
    """Return the ``output`` folder under the TTS directory, creating it if absent."""
    target = _get_tts_dir().joinpath("output")
    target.mkdir(parents=True, exist_ok=True)
    return target
def _get_custom_voices_dir() -> Path:
    """Return the ``voices`` folder (user-cloned voice states), creating it if absent."""
    target = _get_tts_dir().joinpath("voices")
    target.mkdir(parents=True, exist_ok=True)
    return target
def _load_model():
    """Return the module-wide Pocket TTS model, loading it on the first call.

    The loaded model is kept in the module global ``_tts_model`` so later
    calls return it without loading again.

    Raises:
        ImportError: if the ``pocket_tts`` package cannot be imported.
    """
    global _tts_model

    if _tts_model is None:
        # Imported here so the module itself imports without pocket-tts.
        from pocket_tts import TTSModel

        logger.info("Loading Pocket TTS model (100M params, CPU)...")
        _tts_model = TTSModel.load_model()
        logger.info("Pocket TTS model ready")

    return _tts_model
def _get_voice_state(voice: str):
    """Return the voice state for ``voice``, computing and caching it on first use.

    Resolution order:
      1. the in-memory ``_voice_states`` cache;
      2. a cloned voice stored as ``<custom voices dir>/<voice>.safetensors``;
      3. anything else (a built-in voice name or a path to an audio file),
         which is passed to ``model.get_state_for_audio_prompt``.

    Args:
        voice: Built-in voice name, cloned voice name, or audio file path.

    Returns:
        The voice state to hand to ``model.generate_audio``.

    Raises:
        ImportError: if ``pocket_tts`` (or ``safetensors`` for a cloned
            voice) cannot be imported.
    """
    try:
        return _voice_states[voice]
    except KeyError:
        pass

    model = _load_model()

    # Cloned voices take precedence over built-in names and audio paths.
    custom_path = _get_custom_voices_dir() / f"{voice}.safetensors"
    if custom_path.exists():
        from safetensors.torch import load_file

        state = load_file(str(custom_path))
        # pocket_tts_clone_voice() stores a non-dict state wrapped as
        # {"voice_embedding": state}. Unwrap it so a reloaded voice has the
        # same form as the one cached right after cloning.
        # NOTE(review): assumes a genuine dict state never consists of the
        # single key "voice_embedding" — confirm against pocket-tts.
        if set(state) == {"voice_embedding"}:
            state = state["voice_embedding"]
        logger.info(f"Loaded custom voice: {voice}")
    else:
        # Built-in names and ad-hoc audio paths use the same model call.
        state = model.get_state_for_audio_prompt(voice)

    _voice_states[voice] = state
    return state
121# ═══════════════════════════════════════════════════════════════
122# espeak-ng fallback (for systems without pocket-tts)
123# ═══════════════════════════════════════════════════════════════
def _espeak_synthesize(text: str, output_path: str, voice: str = "en") -> bool:
    """Fallback: synthesize ``text`` to a WAV file with the ``espeak-ng`` CLI.

    Args:
        text: Text to speak.
        output_path: Destination passed to espeak-ng's ``-w`` option.
        voice: espeak-ng voice passed to ``-v`` (default ``"en"``).

    Returns:
        True when espeak-ng exits with status 0; False when it exits non-zero,
        cannot be started, or does not finish within 30 seconds.
    """
    import subprocess

    run_kwargs = dict(capture_output=True, text=True, timeout=30)
    # CREATE_NO_WINDOW only exists on Windows builds of subprocess.
    if hasattr(subprocess, "CREATE_NO_WINDOW"):
        run_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW

    # "--" ends option parsing, so text that starts with "-" is spoken
    # instead of being interpreted as an espeak-ng flag.
    command = ["espeak-ng", "-v", voice, "-w", output_path, "--", text]

    try:
        result = subprocess.run(command, **run_kwargs)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (e.g. PermissionError).
        return False
    return result.returncode == 0
140# ═══════════════════════════════════════════════════════════════
141# Public API
142# ═══════════════════════════════════════════════════════════════
def _wav_duration_seconds(path: str) -> float:
    """Return the length in seconds of the WAV file at ``path``, or 0 if unreadable."""
    import wave

    try:
        with wave.open(path, "rb") as wav:
            rate = wav.getframerate()
            return round(wav.getnframes() / rate, 2) if rate else 0
    except (wave.Error, OSError, EOFError):
        return 0


def pocket_tts_synthesize(
    text: str,
    voice: str = "alba",
    output_path: Optional[str] = None,
    sample_rate: Optional[int] = None,
) -> str:
    """Synthesize text to speech using Pocket TTS.

    Tries pocket-tts first and falls back to espeak-ng (always with the
    ``"en"`` voice) when pocket-tts is not importable or synthesis fails.

    Args:
        text: Text to synthesize.
        voice: Voice name (built-in like 'alba', custom name, or path to .wav).
        output_path: Optional output .wav path. When None, a file name derived
            from a hash of ``text`` and ``voice`` is created in the output
            directory.
        sample_rate: Rate written to the WAV file instead of the model's
            native rate. The samples themselves are not resampled.

    Returns:
        JSON string. On success it has 'path', 'duration', 'voice' and
        'engine' keys (plus 'sample_rate' for pocket-tts). On failure it has
        an 'error' key, and a 'detail' key when pocket-tts raised an error
        before the fallback was tried.
    """
    import time as _time

    if not text or not text.strip():
        return json.dumps({"error": "Text is required"})

    if output_path is None:
        import hashlib

        # Hash the whole text. Hashing only the first 50 characters made long
        # texts with a common opening overwrite each other's output file.
        # Names for texts of 50 characters or fewer are unchanged.
        digest = hashlib.md5(f"{text}:{voice}".encode()).hexdigest()[:12]
        output_path = str(_get_output_dir() / f"tts_{digest}.wav")

    started = _time.monotonic()
    failure_detail = None

    # Try Pocket TTS (preferred)
    try:
        # Imported before the model is loaded so a missing dependency fails
        # fast instead of after an expensive model load.
        import scipy.io.wavfile

        model = _load_model()
        voice_state = _get_voice_state(voice)
        audio = model.generate_audio(voice_state, text)

        sr = sample_rate or model.sample_rate
        audio_np = audio.numpy() if hasattr(audio, "numpy") else audio
        scipy.io.wavfile.write(output_path, sr, audio_np)

        duration = len(audio_np) / sr
        # Success log (#88): without it only failures were logged, so
        # operators could not tell from the log whether TTS was working.
        elapsed_ms = int((_time.monotonic() - started) * 1000)
        logger.info(
            f"pocket_tts synthesized {len(text)}ch → {output_path} "
            f"(sr={sr}Hz, dur={duration:.2f}s, voice={voice}, "
            f"latency={elapsed_ms}ms)"
        )
        return json.dumps({
            "path": output_path,
            "duration": round(duration, 2),
            "sample_rate": sr,
            "voice": voice,
            "engine": "pocket-tts",
        })
    except ImportError:
        logger.info("pocket-tts not installed, trying espeak-ng fallback")
    except Exception as e:
        # Keep the reason so the final error does not misreport a real
        # synthesis failure as a missing engine.
        failure_detail = str(e)
        logger.warning(f"Pocket TTS synthesis failed: {e}")

    # Fallback: espeak-ng
    if _espeak_synthesize(text, output_path, voice="en"):
        elapsed_ms = int((_time.monotonic() - started) * 1000)
        logger.info(
            f"pocket_tts(espeak fallback) synthesized {len(text)}ch → "
            f"{output_path} (latency={elapsed_ms}ms)"
        )
        return json.dumps({
            "path": output_path,
            # espeak-ng does not report a duration, so read it back from the
            # file it wrote (0 if the file cannot be parsed).
            "duration": _wav_duration_seconds(output_path),
            "voice": "en",
            "engine": "espeak-ng",
        })

    failure = {"error": "No TTS engine available (install pocket-tts or espeak-ng)"}
    if failure_detail is not None:
        failure["detail"] = failure_detail
    return json.dumps(failure)
def pocket_tts_list_voices() -> str:
    """List available TTS voices and the engine that would serve them.

    Built-in voices come first, followed by every ``*.safetensors`` file in
    the custom voices directory whose stem is not a built-in name.

    Returns:
        JSON string with 'voices' (list of id/name/type/language entries),
        'count', 'engine' ("pocket-tts", "espeak-ng" or "none") and
        'builtin_count'.
    """
    import subprocess

    # Built-in voices
    catalogue = [
        {
            "id": voice_id,
            "name": voice_id.title(),
            "type": "builtin",
            "language": "en",
        }
        for voice_id in _BUILTIN_VOICES
    ]

    # Custom cloned voices
    voices_dir = _get_custom_voices_dir()
    if voices_dir.exists():
        catalogue.extend(
            {
                "id": state_file.stem,
                "name": state_file.stem.title(),
                "type": "cloned",
                "language": "en",
            }
            for state_file in sorted(voices_dir.glob("*.safetensors"))
            if state_file.stem not in _BUILTIN_VOICES
        )

    # Engine detection: pocket-tts if importable, otherwise probe espeak-ng.
    engine = "none"
    try:
        import pocket_tts  # noqa: F401
    except ImportError:
        probe_kwargs = dict(capture_output=True, text=True, timeout=5)
        if hasattr(subprocess, "CREATE_NO_WINDOW"):
            probe_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
        try:
            probe = subprocess.run(["espeak-ng", "--version"], **probe_kwargs)
        except (FileNotFoundError, subprocess.TimeoutExpired):
            pass
        else:
            if probe.returncode == 0:
                engine = "espeak-ng"
    else:
        engine = "pocket-tts"

    summary = {
        "voices": catalogue,
        "count": len(catalogue),
        "engine": engine,
        "builtin_count": len(_BUILTIN_VOICES),
    }
    return json.dumps(summary)
def pocket_tts_clone_voice(audio_path: str, name: str) -> str:
    """Clone a voice from an audio sample (5+ seconds recommended).

    Computes a voice state from the audio, saves it as
    ``<custom voices dir>/<name>.safetensors`` and caches it in memory.
    Requires pocket-tts (no fallback for voice cloning).

    Args:
        audio_path: Path to an existing audio sample file.
        name: Name to save the cloned voice as. It is stripped, lower-cased
            and has spaces replaced by "-". It must not contain path
            separators and must not be a built-in voice name.

    Returns:
        JSON string with 'cloned', 'name', 'path' keys on success, or an
        'error' key on failure.
    """
    if not audio_path or not os.path.isfile(audio_path):
        return json.dumps({"error": "Valid audio_path required"})
    if not name or not name.strip():
        return json.dumps({"error": "Voice name required"})

    name = name.strip().lower().replace(" ", "-")

    # The name becomes a file name inside the voices directory, so reject
    # anything that could make the save path leave that directory.
    if name in {".", ".."} or any(ch in name for ch in ("/", "\\", "\x00")):
        return json.dumps({"error": "Voice name must not contain path separators"})

    # A clone named like a built-in voice would be loaded in place of the
    # built-in by _get_voice_state() while pocket_tts_list_voices() hides it.
    if name in _BUILTIN_VOICES:
        return json.dumps(
            {"error": f"Voice name '{name}' is reserved for a built-in voice"}
        )

    try:
        model = _load_model()
        voice_state = model.get_state_for_audio_prompt(audio_path)

        # Export voice embedding for fast loading (not full model weights)
        save_path = _get_custom_voices_dir() / f"{name}.safetensors"
        from safetensors.torch import save_file

        if isinstance(voice_state, dict):
            save_file(voice_state, str(save_path))
        else:
            # save_file is given a dict, so wrap a bare state under one key.
            save_file({"voice_embedding": voice_state}, str(save_path))

        # Cache it so the new voice is usable without reloading from disk.
        _voice_states[name] = voice_state
        logger.info(f"Voice cloned: {name} from {audio_path}")

        return json.dumps({
            "cloned": True,
            "name": name,
            "path": str(save_path),
        })
    except ImportError:
        return json.dumps({"error": "pocket-tts required for voice cloning"})
    except Exception as e:
        logger.warning(f"Voice cloning failed: {e}")
        return json.dumps({"error": f"Voice cloning failed: {e}"})
def unload_pocket_tts():
    """Drop the cached Pocket TTS model and voice states to free memory.

    After clearing the module-level caches this calls
    ``vram_manager.clear_cuda_cache()`` and then runs a garbage collection.
    """
    import gc

    global _tts_model

    _voice_states.clear()
    _tts_model = None

    from .vram_manager import clear_cuda_cache

    clear_cuda_cache()
    gc.collect()
    logger.info("Pocket TTS model unloaded")
345# ═══════════════════════════════════════════════════════════════
346# Service tool registration
347# ═══════════════════════════════════════════════════════════════
class PocketTTSTool:
    """Registers Pocket TTS in ``service_tool_registry`` as an in-process tool.

    The registered entry uses the ``inprocess://pocket_tts`` base URL and
    describes three endpoints: synthesize, list_voices and clone_voice.
    """

    @classmethod
    def register_functions(cls):
        """Build the ``pocket_tts`` tool entry and store it in the registry.

        The entry is marked healthy and written to
        ``service_tool_registry._tools["pocket_tts"]``.

        Returns:
            True, always.
        """
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Convert text to speech audio. "
                "Input: text (string), voice (optional voice name, "
                "default 'alba'), output_path (optional .wav path). "
                "Returns JSON with audio file path and duration."
            ),
            "params_schema": {
                "text": {"type": "string", "description": "Text to speak"},
                "voice": {"type": "string", "description": "Voice name or .wav path (default: alba)"},
                "output_path": {"type": "string", "description": "Output .wav path (optional)"},
            },
        }

        list_voices_endpoint = {
            "path": "/voices",
            "method": "GET",
            "description": "List available TTS voices (built-in + cloned).",
            "params_schema": {},
        }

        clone_voice_endpoint = {
            "path": "/clone",
            "method": "POST",
            "description": (
                "Clone a voice from an audio sample. "
                "Input: audio_path (path to .wav), name (voice name to save). "
                "Requires 5+ seconds of clear speech."
            ),
            "params_schema": {
                "audio_path": {"type": "string", "description": "Path to audio sample"},
                "name": {"type": "string", "description": "Name for the cloned voice"},
            },
        }

        summary = (
            "Offline text-to-speech via Pocket TTS (Kyutai). "
            "100M params, 6x real-time on CPU, zero-shot voice cloning "
            "from 5s audio. Falls back to espeak-ng if unavailable. "
            "MIT license, 100% local, zero cloud costs."
        )

        entry = ServiceToolInfo(
            name="pocket_tts",
            description=summary,
            base_url="inprocess://pocket_tts",
            endpoints={
                "synthesize": synthesize_endpoint,
                "list_voices": list_voices_endpoint,
                "clone_voice": clone_voice_endpoint,
            },
            health_endpoint="/health",
            tags=["tts", "speech", "synthesis", "voice", "offline", "pocket-tts"],
            timeout=60,
        )
        entry.is_healthy = True
        service_tool_registry._tools["pocket_tts"] = entry
        return True