Coverage for integrations / service_tools / pocket_tts_tool.py: 70.6%

136 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2TTS tool — in-process text-to-speech via Pocket TTS (Kyutai). 

3 

4Pocket TTS benefits: 

5 - 100M params — runs at 6x real-time on CPU (no GPU required) 

6 - MIT license, fully open source 

7 - Zero-shot voice cloning from 5 seconds of audio 

8 - ~200ms latency for first audio chunk 

9 - English (more languages planned by upstream) 

10 - 100% local, zero cloud costs 

11 

12Model downloaded lazily on first use to ~/.hevolve/models/tts/ 

13 

14Fallback: espeak-ng (if pocket-tts not installed). 

15 

16Public API: 

17 pocket_tts_synthesize(text, voice, output_path) → JSON 

18 pocket_tts_list_voices() → JSON 

19 pocket_tts_clone_voice(audio_path, name) → JSON 

20 unload_pocket_tts() → None 

21""" 

22 

23import json 

24import logging 

25import os 

26from pathlib import Path 

27from typing import Optional 

28 

29from .registry import ServiceToolInfo, service_tool_registry 

30 

31logger = logging.getLogger(__name__) 

32 

33# ═══════════════════════════════════════════════════════════════ 

34# Built-in voices (shipped with pocket-tts) 

35# ═══════════════════════════════════════════════════════════════ 

36 

37_BUILTIN_VOICES = [ 

38 "alba", "marius", "javert", "jean", 

39 "fantine", "cosette", "eponine", "azelma", 

40] 

41 

42# ═══════════════════════════════════════════════════════════════ 

43# Cached model (avoid reloading on every call) 

44# ═══════════════════════════════════════════════════════════════ 

45 

46_tts_model = None 

47_voice_states = {} # voice_name -> voice_state cache 

48 

49 

50# ═══════════════════════════════════════════════════════════════ 

51# Model management 

52# ═══════════════════════════════════════════════════════════════ 

53 

54def _get_tts_dir() -> Path: 

55 """Get the TTS model/output storage directory.""" 

56 try: 

57 from .model_storage import model_storage 

58 tts_dir = model_storage.get_tool_dir("tts") 

59 except (ImportError, Exception): 

60 tts_dir = Path(os.path.expanduser("~/.hevolve/models/tts")) 

61 tts_dir.mkdir(parents=True, exist_ok=True) 

62 return tts_dir 

63 

64 

def _get_output_dir() -> Path:
    """Return the ``output`` folder inside the TTS directory, creating it on demand."""
    output_dir = _get_tts_dir().joinpath("output")
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir

70 

71 

def _get_custom_voices_dir() -> Path:
    """Return the ``voices`` folder holding user-cloned voice states, creating it on demand."""
    voices_dir = _get_tts_dir().joinpath("voices")
    voices_dir.mkdir(parents=True, exist_ok=True)
    return voices_dir

77 

78 

def _load_model():
    """Return the shared Pocket TTS model, loading it on the first call.

    The ``pocket_tts`` import is deferred until a model is actually needed,
    so a missing package raises ``ImportError`` here rather than when this
    module is imported.
    """
    global _tts_model
    if _tts_model is None:
        from pocket_tts import TTSModel

        logger.info("Loading Pocket TTS model (100M params, CPU)...")
        _tts_model = TTSModel.load_model()
        logger.info("Pocket TTS model ready")
    return _tts_model

91 

92 

def _get_voice_state(voice: str):
    """Return the voice state for *voice*, resolving and caching it on a miss.

    Resolution order:
      1. the in-memory cache;
      2. a cloned voice stored as ``<voices dir>/<voice>.safetensors``;
      3. ``model.get_state_for_audio_prompt(voice)``, which receives both
         built-in voice names and paths to audio files.

    Args:
        voice: Built-in voice name, cloned voice name, or audio file path.

    Returns:
        The voice state object to hand to ``model.generate_audio``.
    """
    if voice in _voice_states:
        return _voice_states[voice]

    model = _load_model()

    # Cloned voices take precedence over anything the model resolves itself.
    custom_path = _get_custom_voices_dir() / f"{voice}.safetensors"
    if custom_path.exists():
        from safetensors.torch import load_file
        state = load_file(str(custom_path))
        # pocket_tts_clone_voice() stores a non-dict state wrapped under the
        # single key "voice_embedding". Unwrap it so a reloaded voice is the
        # same kind of object as the one cached at clone time.
        # NOTE(review): assumes a genuine dict state never consists of only
        # this key - confirm against the pocket-tts state format.
        if set(state) == {"voice_embedding"}:
            state = state["voice_embedding"]
        logger.info(f"Loaded custom voice: {voice}")
    else:
        # Audio-file paths and built-in names go through the same model call.
        state = model.get_state_for_audio_prompt(voice)

    _voice_states[voice] = state
    return state

119 

120 

121# ═══════════════════════════════════════════════════════════════ 

122# espeak-ng fallback (for systems without pocket-tts) 

123# ═══════════════════════════════════════════════════════════════ 

124 

125def _espeak_synthesize(text: str, output_path: str, voice: str = "en") -> bool: 

126 """Fallback: use espeak-ng for basic TTS.""" 

127 import subprocess 

128 try: 

129 _kw = dict(capture_output=True, text=True, timeout=30) 

130 if hasattr(subprocess, 'CREATE_NO_WINDOW'): 

131 _kw['creationflags'] = subprocess.CREATE_NO_WINDOW 

132 result = subprocess.run( 

133 ["espeak-ng", "-v", voice, "-w", output_path, text], 

134 **_kw) 

135 return result.returncode == 0 

136 except (FileNotFoundError, subprocess.TimeoutExpired): 

137 return False 

138 

139 

140# ═══════════════════════════════════════════════════════════════ 

141# Public API 

142# ═══════════════════════════════════════════════════════════════ 

143 

def _wav_duration_seconds(path: str) -> float:
    """Return the length in seconds of the WAV file at *path*, or 0 if it cannot be read."""
    import wave
    try:
        with wave.open(path, "rb") as wav:
            rate = wav.getframerate()
            return round(wav.getnframes() / rate, 2) if rate else 0
    except (wave.Error, OSError, EOFError):
        return 0


def pocket_tts_synthesize(
    text: str,
    voice: str = "alba",
    output_path: Optional[str] = None,
    sample_rate: Optional[int] = None,
) -> str:
    """Synthesize text to speech, preferring Pocket TTS over espeak-ng.

    Pocket TTS is tried first. If it is not installed or raises, espeak-ng
    is used instead; the fallback always speaks with the espeak voice "en",
    regardless of *voice*.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        voice: Voice name (built-in like 'alba', custom name, or path to .wav).
        output_path: Output .wav path. When None, a name derived from the
            text and voice is generated inside the TTS output directory.
        sample_rate: Rate written to the file instead of the model's own
            ``sample_rate``. The audio samples themselves are not changed.

    Returns:
        JSON string. On success it has 'path', 'duration', 'voice' and
        'engine' keys (plus 'sample_rate' for pocket-tts). On failure it has
        an 'error' key, plus a 'detail' key carrying the pocket-tts failure
        reason when no engine could produce audio.
    """
    if not text or not text.strip():
        return json.dumps({"error": "Text is required"})

    if output_path is None:
        import hashlib
        # Hash the complete text: hashing only a prefix made long texts that
        # share an opening resolve to the same file and overwrite each other.
        digest = hashlib.md5(f"{text}:{voice}".encode()).hexdigest()[:12]
        output_path = str(_get_output_dir() / f"tts_{digest}.wav")

    import time as _time
    started = _time.monotonic()
    failure = None  # why pocket-tts did not produce audio, if it did not

    # Preferred engine: Pocket TTS.
    try:
        model = _load_model()
        voice_state = _get_voice_state(voice)
        audio = model.generate_audio(voice_state, text)

        sr = sample_rate or model.sample_rate
        # Objects exposing .numpy() are converted; anything else is written as-is.
        audio_np = audio.numpy() if hasattr(audio, 'numpy') else audio

        import scipy.io.wavfile
        scipy.io.wavfile.write(output_path, sr, audio_np)

        duration = len(audio_np) / sr
        # Success log - required for #88: without it operators could not tell
        # from the log alone whether TTS was working or silently no-oping,
        # because only the failure paths logged anything.
        elapsed_ms = int((_time.monotonic() - started) * 1000)
        logger.info(
            f"pocket_tts synthesized {len(text)}ch → {output_path} "
            f"(sr={sr}Hz, dur={duration:.2f}s, voice={voice}, "
            f"latency={elapsed_ms}ms)"
        )
        return json.dumps({
            "path": output_path,
            "duration": round(duration, 2),
            "sample_rate": sr,
            "voice": voice,
            "engine": "pocket-tts",
        })
    except ImportError as exc:
        failure = f"pocket-tts unavailable: {exc}"
        logger.info("pocket-tts not installed, trying espeak-ng fallback")
    except Exception as exc:
        failure = f"pocket-tts failed: {exc}"
        logger.warning(f"Pocket TTS synthesis failed: {exc}")

    # Fallback engine: espeak-ng.
    if _espeak_synthesize(text, output_path, voice="en"):
        elapsed_ms = int((_time.monotonic() - started) * 1000)
        logger.info(
            f"pocket_tts(espeak fallback) synthesized {len(text)}ch → "
            f"{output_path} (latency={elapsed_ms}ms)"
        )
        return json.dumps({
            "path": output_path,
            # espeak-ng does not report a duration, so read it back from the
            # file it wrote (0 when the file cannot be parsed).
            "duration": _wav_duration_seconds(output_path),
            "voice": "en",
            "engine": "espeak-ng",
        })

    error = {"error": "No TTS engine available (install pocket-tts or espeak-ng)"}
    if failure:
        # Surface the real pocket-tts problem; the generic message alone is
        # misleading when the package is installed but synthesis raised.
        error["detail"] = failure
    return json.dumps(error)

225 

226 

def pocket_tts_list_voices() -> str:
    """List available TTS voices and the engine that would serve them.

    Built-in voices come first, followed by every ``*.safetensors`` file in
    the custom voices directory (sorted by path) whose stem is not a
    built-in name.

    Returns:
        JSON string with 'voices' (list of id/name/type/language dicts),
        'count', 'engine' ("pocket-tts", "espeak-ng" or "none") and
        'builtin_count'.
    """
    voices = [
        {"id": name, "name": name.title(), "type": "builtin", "language": "en"}
        for name in _BUILTIN_VOICES
    ]
    voices.extend(
        {
            "id": path.stem,
            "name": path.stem.title(),
            "type": "cloned",
            "language": "en",
        }
        for path in sorted(_get_custom_voices_dir().glob("*.safetensors"))
        if path.stem not in _BUILTIN_VOICES
    )

    # Engine detection: pocket-tts wins when importable; espeak-ng is only
    # probed when it is not.
    engine = "none"
    try:
        import pocket_tts  # noqa: F401
        engine = "pocket-tts"
    except ImportError:
        import subprocess
        probe_kwargs = dict(capture_output=True, text=True, timeout=5)
        # CREATE_NO_WINDOW only exists on Windows; it suppresses the console
        # window of the child process there.
        if hasattr(subprocess, 'CREATE_NO_WINDOW'):
            probe_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW
        try:
            probe = subprocess.run(["espeak-ng", "--version"], **probe_kwargs)
        except (OSError, subprocess.TimeoutExpired):
            # Missing, non-executable or hung binary: listing voices must
            # still succeed, so the engine simply stays "none".
            pass
        else:
            if probe.returncode == 0:
                engine = "espeak-ng"

    return json.dumps({
        "voices": voices,
        "count": len(voices),
        "engine": engine,
        "builtin_count": len(_BUILTIN_VOICES),
    })

282 

283 

def pocket_tts_clone_voice(audio_path: str, name: str) -> str:
    """Clone a voice from an audio sample (5+ seconds recommended).

    Computes the voice state for the sample, stores it as
    ``<voices dir>/<name>.safetensors`` and caches it under *name*.
    Requires pocket-tts (no fallback for voice cloning).

    Args:
        audio_path: Path to an existing audio sample (.wav/.mp3).
        name: Name to save the cloned voice as. It is stripped, lower-cased
            and has spaces replaced by '-'. Names containing path
            separators are rejected.

    Returns:
        JSON string with 'cloned', 'name', 'path' keys on success, or an
        'error' key describing why nothing was cloned.
    """
    if not audio_path or not os.path.isfile(audio_path):
        return json.dumps({"error": "Valid audio_path required"})
    if not name or not name.strip():
        return json.dumps({"error": "Voice name required"})

    name = name.strip().lower().replace(" ", "-")

    # The name becomes a file name inside the voices directory. Anything with
    # a directory component (e.g. "../x", or a backslash, which is a
    # separator on Windows) could place the file elsewhere, so refuse it.
    if Path(name).name != name or "\\" in name:
        return json.dumps({"error": "Voice name must not contain path separators"})

    try:
        model = _load_model()
        voice_state = model.get_state_for_audio_prompt(audio_path)

        # Export voice embedding for fast loading (not full model weights).
        save_path = _get_custom_voices_dir() / f"{name}.safetensors"
        from safetensors.torch import save_file
        if isinstance(voice_state, dict):
            save_file(voice_state, str(save_path))
        else:
            # save_file is given a mapping in both branches; a non-dict
            # state is wrapped under a fixed key.
            save_file({"voice_embedding": voice_state}, str(save_path))

        # Cache it so the next synthesis with this name skips the disk read.
        _voice_states[name] = voice_state
        logger.info(f"Voice cloned: {name} from {audio_path}")

        return json.dumps({
            "cloned": True,
            "name": name,
            "path": str(save_path),
        })
    except ImportError:
        return json.dumps({"error": "pocket-tts required for voice cloning"})
    except Exception as e:
        # Log as well as return, so failures are visible to operators.
        logger.warning(f"Voice cloning failed for {name}: {e}")
        return json.dumps({"error": f"Voice cloning failed: {e}"})

329 

330 

def unload_pocket_tts():
    """Drop the cached model and voice states so their memory can be reclaimed.

    Calling this when nothing is loaded is harmless: the model reference is
    simply reset to None and the (possibly empty) voice cache is cleared.
    """
    global _tts_model
    _tts_model = None
    _voice_states.clear()  # mutated in place, so no global declaration needed

    # Collect before touching the CUDA cache so objects that were only kept
    # alive by reference cycles are gone first.
    import gc
    gc.collect()

    try:
        from .vram_manager import clear_cuda_cache
        # presumably releases cached GPU memory - see vram_manager for details
        clear_cuda_cache()
    except Exception as exc:
        # The references are already dropped at this point; a missing or
        # failing VRAM helper must not turn unloading into an error.
        logger.debug(f"CUDA cache clear skipped: {exc}")

    logger.info("Pocket TTS model unloaded")

343 

344 

345# ═══════════════════════════════════════════════════════════════ 

346# Service tool registration 

347# ═══════════════════════════════════════════════════════════════ 

348 

class PocketTTSTool:
    """Describe Pocket TTS to the service tool registry as an in-process tool.

    The tool is advertised under the ``inprocess://pocket_tts`` base URL
    with three endpoints: synthesize, list_voices and clone_voice.
    """

    @classmethod
    def register_functions(cls):
        """Build the Pocket TTS tool description and store it in the registry.

        Returns:
            True once the entry has been stored under the key "pocket_tts".
        """
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Convert text to speech audio. "
                "Input: text (string), voice (optional voice name, "
                "default 'alba'), output_path (optional .wav path). "
                "Returns JSON with audio file path and duration."
            ),
            "params_schema": {
                "text": {"type": "string", "description": "Text to speak"},
                "voice": {"type": "string", "description": "Voice name or .wav path (default: alba)"},
                "output_path": {"type": "string", "description": "Output .wav path (optional)"},
            },
        }
        list_voices_endpoint = {
            "path": "/voices",
            "method": "GET",
            "description": "List available TTS voices (built-in + cloned).",
            "params_schema": {},
        }
        clone_voice_endpoint = {
            "path": "/clone",
            "method": "POST",
            "description": (
                "Clone a voice from an audio sample. "
                "Input: audio_path (path to .wav), name (voice name to save). "
                "Requires 5+ seconds of clear speech."
            ),
            "params_schema": {
                "audio_path": {"type": "string", "description": "Path to audio sample"},
                "name": {"type": "string", "description": "Name for the cloned voice"},
            },
        }

        summary = (
            "Offline text-to-speech via Pocket TTS (Kyutai). "
            "100M params, 6x real-time on CPU, zero-shot voice cloning "
            "from 5s audio. Falls back to espeak-ng if unavailable. "
            "MIT license, 100% local, zero cloud costs."
        )

        tool_info = ServiceToolInfo(
            name="pocket_tts",
            description=summary,
            base_url="inprocess://pocket_tts",
            endpoints={
                "synthesize": synthesize_endpoint,
                "list_voices": list_voices_endpoint,
                "clone_voice": clone_voice_endpoint,
            },
            health_endpoint="/health",
            tags=["tts", "speech", "synthesis", "voice", "offline", "pocket-tts"],
            timeout=60,
        )
        # Marked healthy up front; no health probe is made here.
        tool_info.is_healthy = True
        # Stored directly in the registry's tool mapping.
        service_tool_registry._tools["pocket_tts"] = tool_info
        return True